# Task A: Test Data Preprocessing

## 1. Preprocess the label file of test data

In [10]:
# Import necessary libraries
import numpy as np
import pandas as pd

In [11]:
# Read label (.csv file) of dataset.
label_csv = pd.read_csv('.\\test\label.csv')

# Print basic information of label, notice that it contains 4 classes based on tumor type. 
print(label_csv.info(), '\n')
print(label_csv.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  200 non-null    object
 1   label      200 non-null    object
dtypes: object(2)
memory usage: 3.2+ KB
None 

        file_name             label
0  IMAGE_0000.jpg  meningioma_tumor
1  IMAGE_0001.jpg  meningioma_tumor
2  IMAGE_0002.jpg      glioma_tumor
3  IMAGE_0003.jpg   pituitary_tumor
4  IMAGE_0004.jpg      glioma_tumor


In [12]:
# For binary task classification, we simply use "0" and "1" to indicate and identify whether there is a tumor in the MRI images. 
label_csv['new'] = 1
label_csv['new'][label_csv['label']=='no_tumor'] = 0

# Check result.
label_csv.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,file_name,label,new
0,IMAGE_0000.jpg,meningioma_tumor,1
1,IMAGE_0001.jpg,meningioma_tumor,1
2,IMAGE_0002.jpg,glioma_tumor,1
3,IMAGE_0003.jpg,pituitary_tumor,1
4,IMAGE_0004.jpg,glioma_tumor,1


In [13]:
# Transfer label into array dataformat.
label_TaskA = np.array(label_csv['new']).astype('uint8')

# Check result.
label_TaskA.shape, label_TaskA.dtype

((200,), dtype('uint8'))

## 2. Preprocess MRI test images

In [14]:
# Import necessary libraries
import cv2
from glob import glob

In [15]:
# Read 3000 512x512 pixel gray-scale MRI images
images = [cv2.imread(image, cv2.IMREAD_GRAYSCALE) 
          for image in glob('test/image/*.jpg')]


### Uncomment the following code to check MRI image.
# cv2.imshow('MRI Image Test', images[0])
# cv2.waitKey(0)
# cv2.destroyAllWindows()

In [16]:
# Median filter is used to remove noise from the MRI images. 
images_blur = [cv2.medianBlur(images[i], 5)
               for i in range(0, len(images))]


### Uncomment the following code to check filtered MRI image.
# cv2.imshow('MRI Image After Median Filter Test', images_blur[0])
# cv2.waitKey(0)
# cv2.destroyAllWindows()

In [17]:
# Enhance the contrast of the picture.
alpha = 1.5 # Contrast control (1.0-3.0)
beta = 0 # Brightness control (0-100)

images_adjusted = [cv2.convertScaleAbs(images_blur[i], alpha=alpha, beta=beta)
                   for i in range(0, len(images))]

# Normalize pictures by using Maximum and minimum normalization.
images_normlize = [cv2.normalize(images_adjusted[i], None, alpha=0,beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F) 
                   for i in range(0, len(images))]


### Uncomment the following code to check image after enhancement.
# cv2.imshow('MRI Image After Median Filter Test', images_adjusted[0])
# cv2.waitKey(0)
# cv2.destroyAllWindows()

In [18]:
# Resize images into 128*128 pixel
resize = 128
images_resized = [cv2.resize(images_normlize[i], (resize,resize))
                  for i in range(0, len(images))]

# Transfer images into array dataformat.
images_TaskA = np.array(images_resized)

# Print image infotmation
print('{} images with {}x{} pixel. \n'.format(images_TaskA.shape[0], 
                                              images_TaskA.shape[1], 
                                              images_TaskA.shape[2]))

# Vectorize images. 
images_TaskA = images_TaskA.reshape((200, resize*resize))

# Print vectorized image information
print('After vectorization')
print('{} vectorized images with {}(512^2) pixel.'.format(images_TaskA.shape[0], 
                                                          images_TaskA.shape[1]))

200 images with 128x128 pixel. 

After vectorization
200 vectorized images with 16384(512^2) pixel.


## 3. Implement PCA into test data

In [22]:
# Import necessary libraries
import pickle

In [24]:
# Load PCA model trained with dataset
with open('Model/DataProcess_PCA.pickle', 'rb') as handle:
    pca_TaskA = pickle.load(handle)

# Implement PCA model into test images to reduce the data dimension
images_PCA = pca_TaskA.transform(images_TaskA)

## 4. Save preprocessed test data

In [21]:
# Generate new data array of preprocessed data.
images_AfterProcess = images_PCA.copy()
label_AfterProcess = label_TaskA.copy()

# Check preprocessed data.
images_AfterProcess.shape, label_AfterProcess.shape

((200, 200), (200,))

In [14]:
# Save preprocessed test data with help of pickle.
with open('DataAfterProcess/test_images_AfterProcess.pickle', 'wb') as handle:
    pickle.dump(images_AfterProcess, handle)
    
with open('DataAfterProcess/test_label_AfterProcess.pickle', 'wb') as handle:
    pickle.dump(label_AfterProcess, handle)