<a href="https://colab.research.google.com/github/ayush9818/brain-tumour-detection/blob/main/Data_Preparation_for_Brain_Tumour_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive/ so the dataset stored on Drive is reachable.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import os
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
from os import listdir
import time    
import pandas as pd
%matplotlib inline

In [None]:
# Root folder of the brain-tumour dataset on the mounted Drive.
data_path = '/content/drive/MyDrive/brain_tumor_dataset/'
# Fail fast if the folder is missing (e.g. Drive is not mounted).
assert os.path.exists(data_path)

In [None]:
# Show the top-level entries of the dataset folder.
os.listdir(data_path)

['no', 'yes', 'augmented_data', 'df.csv', 'model_training']

In [None]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``"h:m:s"`` with seconds to one decimal.

    Args:
        sec_elapsed: Non-negative duration in seconds (int or float).

    Returns:
        A string such as ``"0:1:52.5"``. Fields are not zero-padded.
    """
    # Round to the displayed precision *before* splitting into fields so that
    # a carry propagates: 59.96 s becomes "0:1:0.0" rather than "0:0:60.0".
    total = round(sec_elapsed, 1)
    h = int(total / (60 * 60))
    m = int((total % (60 * 60)) / 60)
    s = total % 60
    # The second round() only strips float noise left by the modulo.
    return f"{h}:{m}:{round(s,1)}"

In [None]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
  """Write randomly augmented JPEG copies of every image found in ``file_dir``.

  Each readable image is fed through a Keras ``ImageDataGenerator`` (rotation,
  width/height shift, shear, brightness change, horizontal and vertical flips)
  and the generated images are saved to ``save_to_dir`` with the prefix
  ``aug_<original file stem>``.

  Args:
    file_dir: Directory holding the source images.
    n_generated_samples: Controls how many augmented images are produced per
      source image. NOTE: the inner loop stops only once its counter exceeds
      this value, so ``n_generated_samples + 1`` batches are generated per
      image. This is kept unchanged so existing dataset sizes stay the same.
    save_to_dir: Output directory; created (including parents) if missing.
  """
  data_gen = ImageDataGenerator(rotation_range=10,
                                width_shift_range=0.1,
                                height_shift_range=0.1,
                                shear_range=0.1,
                                brightness_range=(0.3, 1.0),
                                horizontal_flip=True,
                                vertical_flip=True,
                                fill_mode='nearest')

  os.makedirs(save_to_dir, exist_ok=True)

  for filename in listdir(file_dir):
    image = cv2.imread(os.path.join(file_dir, filename))
    if image is None:
      # cv2.imread signals an unreadable / non-image file by returning None
      # instead of raising; skip it rather than crash on `image.shape`.
      print(f"Skipping unreadable file: {filename}")
      continue

    # The generator expects a batch dimension in front: (1, H, W, C).
    image = image.reshape((1,) + image.shape)
    # splitext handles any extension length ('.jpg', '.JPG', '.jpeg', ...).
    save_prefix = 'aug_' + os.path.splitext(filename)[0]

    i = 0
    # flow() is an endless generator that saves one image per yielded batch;
    # the loop has to be broken manually.
    for batch in data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir,
                               save_prefix=save_prefix, save_format='jpg'):
      i += 1
      if i > n_generated_samples:
        break

In [None]:
# Peek at one file name per class. The class folders are named explicitly
# because data_path also holds 'augmented_data', 'df.csv' and 'model_training';
# calling listdir() on 'df.csv' raises NotADirectoryError on a re-run.
for cls in ('no', 'yes'):
  class_files = listdir(os.path.join(data_path, cls))
  # Guard against an empty class folder instead of failing on [0].
  print(class_files[0] if class_files else f'{cls}: <empty>')

11 no.jpg
Y58.JPG


In [None]:
save_path = '/content/drive/MyDrive/brain_tumor_dataset/augmented_data/'

# n_generated_samples passed to augment_data for each class folder
# (presumably chosen to even out the class sizes; confirm against raw counts).
samples_per_class = {'yes': 6, 'no': 9}

start_time = time.time()
for class_name, n_samples in samples_per_class.items():
  # Source images come from data_path/<class>, output goes to save_path/<class>.
  augment_data(file_dir=data_path + class_name,
               n_generated_samples=n_samples,
               save_to_dir=save_path + class_name)
end_time = time.time()

execution_time = end_time - start_time
print(f"Elapsed time: {hms_string(execution_time)}")

Elapsed time: 0:1:52.5


In [None]:
def create_data_summary(data_path):
  """Print the number of samples per class and each class's share of the total.

  Args:
    data_path: Directory containing the 'yes' (tumour) and 'no' (no tumour)
      sub-folders. A trailing path separator is optional.

  The value printed after "Percentage" is a fraction in [0, 1], not a value
  scaled to 100. For an empty dataset both fractions are reported as 0.0.
  """
  # os.path.join works with or without a trailing slash on data_path.
  yes_samples = len(listdir(os.path.join(data_path, 'yes')))
  no_samples = len(listdir(os.path.join(data_path, 'no')))

  total_samples = yes_samples + no_samples

  def share(count):
    # Avoid ZeroDivisionError when both class folders are empty.
    return count / total_samples if total_samples else 0.0

  print(f'No of Samples : {total_samples}')
  print("Total Samples with Tumour : {}, Percentage : {}".format(yes_samples, share(yes_samples)))
  print("Total Samples without Tumour : {}, Percentage : {}".format(no_samples, share(no_samples)))

In [None]:
# Report class counts for the augmented dataset under save_path.
create_data_summary(save_path)

No of Samples : 2064
Total Samples with Tumour : 1085, Percentage : 0.5256782945736435
Total Samples without Tumour : 979, Percentage : 0.4743217054263566


In [None]:
import pandas as pd
import glob
import random

# Fraction of the samples assigned to each split. 'train' is not read
# directly: the train split receives whatever is left after test and valid.
split_dict = {'train' : 0.7, 'test' : 0.15, 'valid': 0.15}


def get_train_test_valid_split(image_path_list):
  """Return one split label per element of ``image_path_list``.

  The labels come back in blocks -- all 'train', then all 'test', then all
  'valid' -- so the caller must shuffle the paths beforehand.

  Args:
    image_path_list: Sequence of samples; only its length is used.

  Returns:
    A list of ``len(image_path_list)`` strings drawn from
    {'train', 'test', 'valid'}.
  """
  total = len(image_path_list)
  test = int(split_dict['test'] * total)
  # Both hold-out fractions are taken from the *total*. Taking the validation
  # share from what remains after removing the test set would shrink it to
  # 0.15 * 0.85 of the data instead of the configured 0.15.
  valid = int(split_dict['valid'] * total)
  # Train absorbs the rounding remainder, so the three counts always sum to total.
  train = total - test - valid
  return ['train'] * train + ['test'] * test + ['valid'] * valid

def create_df(dataset_path, seed=None):
  """Build a table assigning every ``*.jpg`` under ``dataset_path`` to a split.

  Each sub-directory of ``dataset_path`` is treated as one class. The images of
  a class are shuffled and labelled by ``get_train_test_valid_split``, so the
  split is made per class.

  Args:
    dataset_path: Directory whose sub-directories are the class folders.
    seed: Optional seed for a reproducible shuffle. With the default ``None``
      the module-level ``random`` state is used, as before.

  Returns:
    A DataFrame with columns ``image_path``, ``image_name``, ``type`` (split
    name) and ``label`` (class folder name), with a fresh 0..N-1 index. It is
    empty, but has the same columns, when no images are found.
  """
  columns = ['image_path', 'image_name', 'type', 'label']
  rng = random.Random(seed) if seed is not None else random

  frames = []
  # Sorted so that a seeded run does not depend on the OS listing order.
  for cls in sorted(listdir(dataset_path)):
    # Search under the dataset_path argument, not the global save_path.
    class_dir = os.path.join(dataset_path, cls)
    if not os.path.isdir(class_dir):
      continue

    image_path_list = sorted(glob.glob(os.path.join(class_dir, "*.jpg")))
    rng.shuffle(image_path_list)
    type_ = get_train_test_valid_split(image_path_list)
    image_name = [os.path.basename(image_path) for image_path in image_path_list]
    frames.append(pd.DataFrame({'image_path' : image_path_list,
                                'image_name' : image_name,
                                'type' : type_,
                                'label' : [cls] * len(image_path_list)},
                               columns=columns))

  if not frames:
    return pd.DataFrame(columns=columns)
  # Concatenate once; ignore_index avoids duplicate index values across classes.
  return pd.concat(frames, ignore_index=True)

In [None]:
# Build the split table (image path, file name, split, label) for the augmented images.
df = create_df(save_path)

In [None]:
# index=False keeps the positional index out of the file; otherwise every
# write/read round-trip adds another 'Unnamed: 0' column to the table.
df.to_csv('/content/drive/MyDrive/brain_tumor_dataset/df.csv', index=False)

In [None]:
# Reload the split table and keep only the train and validation rows,
# flagged by a boolean 'is_valid' column.
df = pd.read_csv('/content/drive/MyDrive/brain_tumor_dataset/df.csv')

# .copy() makes each subset an independent frame, so adding a column below
# does not raise SettingWithCopyWarning.
train_df = df[df['type']=='train'].copy()
print(train_df.shape)

valid_df = df[df['type']=='valid'].copy()
print(valid_df.shape)

train_df['is_valid'] = False
valid_df['is_valid'] = True

combined_df = pd.concat([train_df,valid_df])
combined_df.head()

(1494, 5)
(262, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0.1,Unnamed: 0,image_path,image_name,type,label,is_valid
0,0,/content/drive/MyDrive/brain_tumor_dataset/aug...,aug_Y7_0_745.jpg,train,yes,False
1,1,/content/drive/MyDrive/brain_tumor_dataset/aug...,aug_Y81_0_3635.jpg,train,yes,False
2,2,/content/drive/MyDrive/brain_tumor_dataset/aug...,aug_Y160_0_1252.jpg,train,yes,False
3,3,/content/drive/MyDrive/brain_tumor_dataset/aug...,aug_Y3_0_9109.jpg,train,yes,False
4,4,/content/drive/MyDrive/brain_tumor_dataset/aug...,aug_Y242_0_5993.jpg,train,yes,False


In [None]:
# Sanity check: row/column count of the combined train + validation table.
print(combined_df.shape)

(1756, 6)
