## RGB Image SegNet Segmentation

In [1]:
import os
import cv2
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from PIL import Image, ImageOps
from tensorflow.keras.models import load_model
from custom_functions import dice_loss, dice_coefficient, iou

In [2]:
folder_path = r'C:\Users\BoonJane\Desktop\oralcancer\RGBimage' 
class_names = ['Normal', 'Hyperchromasia', 'Atypical mitotic figure', 'Abnormal variation in nuclear shape'] 

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
data = pd.DataFrame(columns=['image_path', 'label'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        data = data.append({'image_path': file_path, 'label': class_index}, ignore_index=True)

In [5]:
print(data.head(220)) 

                                            image_path label
0    C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     0
1    C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     0
2    C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     0
3    C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     0
4    C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     0
..                                                 ...   ...
215  C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     3
216  C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     3
217  C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     3
218  C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     3
219  C:\Users\BoonJane\Desktop\oralcancer\RGBimage\...     3

[220 rows x 2 columns]


In [6]:
data.to_csv('datargb.csv', index=False)

In [7]:
# Open an image and a mask
image_path = r"C:\Users\BoonJane\Desktop\oralcancer\RGBimage\Abnormal variation in nuclear shape\image_165.jpg"

image = Image.open(image_path)

# Get the size of the image and mask using the size attribute
image_size = image.size

print("Image size:", image_size)

# Convert the images to NumPy arrays
image_array = np.array(image)

# Get the shape of the arrays using the shape attribute
image_shape = image_array.shape

print("Image shape:", image_shape)


Image size: (256, 256)
Image shape: (256, 256, 3)


In [8]:
custom_objects = {
    'dice_loss': dice_loss,
    'dice_coefficient': dice_coefficient,
    'iou': iou
}

model = load_model('segnetrgb.h5', custom_objects=custom_objects)

In [9]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 conv2d_9 (Conv2D)           (None, 256, 256, 64)      1792      
                                                                 
 conv2d_10 (Conv2D)          (None, 256, 256, 64)      36928     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 128, 128, 64)     0         
 2D)                                                             
                                                                 
 conv2d_11 (Conv2D)          (None, 128, 128, 128)     73856     
                                                                 
 conv2d_12 (Conv2D)          (None, 128, 128, 128)     147584    
                                                           

In [10]:
model.input

<KerasTensor: shape=(None, 256, 256, 3) dtype=float32 (created by layer 'input_2')>

In [11]:
model.output

<KerasTensor: shape=(None, 256, 256, 3) dtype=float32 (created by layer 'conv2d_17')>

In [12]:
data = pd.read_csv('datargb.csv')

In [13]:
def preprocess_image(image):
    # resize (make sure input image is 256*256)
    resized_image = cv2.resize(image, (256, 256))

    # normalized
    normalized_image = resized_image / 255.0

    # adjust image shape to (1, 256, 256, 3)
    processed_image = np.expand_dims(normalized_image, axis=0)  

    return processed_image

In [14]:
def postprocess_image(segmented_image, threshold=0.5):
    # convert from 0 and 1 to 0 and 255 (able to see the mask)
    binary_image = np.where(segmented_image >= threshold, 0, 255).astype(np.uint8)
    return binary_image

In [15]:
output_folder = r"C:\Users\BoonJane\Desktop\oralcancer\segnetrgb\segmented_mask"
output_folder2 = r"C:\Users\BoonJane\Desktop\oralcancer\segnetrgb\postprocessed_mask"
os.makedirs(output_folder, exist_ok=True)
os.makedirs(output_folder2, exist_ok=True)

In [16]:
class_mapping = {
    0: 'Normal',
    1: 'Hyperchromasia',
    2: 'Atypical mitotic figure',
    3: 'Abnormal variation in nuclear shape'
}

In [17]:
for index, row in data.iterrows():
    image_path = row['image_path']
    label = row['label']

    image = cv2.imread(image_path)

    preprocessed_image = preprocess_image(image)
    print('preprocessed image shape:', preprocessed_image.shape)

    segmented_mask = model.predict(preprocessed_image) 
    print(segmented_mask.shape)

    postprocessed_mask = postprocess_image(segmented_mask, threshold=0.7)
    print(postprocessed_mask.shape)
    
    # save the segmented image to local folder
    for i, mask in enumerate(segmented_mask):
        class_name = class_mapping[label]  
        label_folder = os.path.join(output_folder, class_name)
        os.makedirs(label_folder, exist_ok=True)

        print(mask.shape)

        save_path = os.path.join(label_folder, f'{class_name}_{index}_{i}.jpg')
        cv2.imwrite(save_path, mask)
        
    # save the postprocessed mask to local folder
    for i, mask in enumerate(postprocessed_mask):
        class_name = class_mapping[label]  
        label_folder = os.path.join(output_folder2, class_name)
        os.makedirs(label_folder, exist_ok=True)

        print(mask.shape)

        save_path = os.path.join(label_folder, f'{class_name}_{index}_{i}.jpg')
        cv2.imwrite(save_path, mask)

preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256

(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 2

(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 2

(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 2

(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 256, 3)
preprocessed image shape: (1, 256, 256, 3)
(1, 256, 256, 3)
(1, 256, 256, 3)
(256, 256, 3)
(256, 2

## Data Preparation for Classification

In [1]:
import os
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import warnings
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from keras.models import load_model
from skimage.feature import local_binary_pattern
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

In [2]:
folder_path = r"C:\Users\BoonJane\Desktop\oralcancer\segnetrgb\segmented_mask" 
folder2_path = r"C:\Users\BoonJane\Desktop\oralcancer\segnetrgb\postprocessed_mask" 
class_names = ['Normal', 'Hyperchromasia', 'Atypical mitotic figure', 'Abnormal variation in nuclear shape'] 

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
segmenteddata = pd.DataFrame(columns=['segmentedmask_path', 'class'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        segmenteddata = segmenteddata.append({'segmentedmask_path': file_path, 'class': class_index}, ignore_index=True)

In [5]:
postprocesseddata = pd.DataFrame(columns=['postprocessedmask_path', 'class'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder2_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        postprocesseddata = postprocesseddata.append({'postprocessedmask_path': file_path, 'class': class_index}, ignore_index=True)

In [6]:
segmenteddata.head(220)

Unnamed: 0,segmentedmask_path,class
0,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
1,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
2,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
3,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
4,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
...,...,...
215,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3
216,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3
217,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3
218,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3


In [7]:
postprocesseddata.head(220)

Unnamed: 0,postprocessedmask_path,class
0,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
1,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
2,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
3,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
4,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,0
...,...,...
215,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3
216,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3
217,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3
218,C:\Users\BoonJane\Desktop\oralcancer\segnetrgb...,3


## Feature Extraction

In [8]:
def calculate_area(postprocessed_mask):
    contours, _ = cv2.findContours(postprocessed_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    areas = np.array([cv2.contourArea(contour) for contour in contours])
    if len(areas) < max_contour_count:
        areas = np.concatenate([areas, default_area[len(areas):]])
    return areas

In [9]:
def calculate_perimeter(postprocessed_mask):
    contours, _ = cv2.findContours(postprocessed_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    perimeters = np.array([cv2.arcLength(contour, closed=True) for contour in contours])
    if len(perimeters) < max_contour_count:
        perimeters = np.concatenate([perimeters, default_perimeter[len(perimeters):]])
    return perimeters

In [10]:
def calculate_circularity(postprocessed_mask):
    perimeters = calculate_perimeter(postprocessed_mask)
    areas = calculate_area(postprocessed_mask)
    circularities = []
    for perimeter, area in zip(perimeters, areas):
        if perimeter == 0:
            circularity = 0
        else:
            circularity = (4 * np.pi * area) / (perimeter ** 2)
        circularities.append(circularity)
    circularities = np.array(circularities)
    if len(circularities) < max_contour_count:
        circularities = np.concatenate([circularities, default_circularity[len(circularities):]])
    return circularities

In [11]:
def calculate_color(postprocessed_mask):
    if postprocessed_mask.ndim == 3 and postprocessed_mask.shape[2] == 3:
        rgb_image = postprocessed_mask
    else:
        rgb_image = cv2.cvtColor(postprocessed_mask, cv2.COLOR_GRAY2RGB)
        
    pixels = rgb_image.reshape(-1, 3)  
    histogram, _ = np.histogramdd(pixels, bins=(8, 8, 8), range=[(0, 255), (0, 255), (0, 255)])
    color_feature = histogram.flatten() / np.sum(histogram)  
    
    return color_feature

In [12]:
max_contour_count = 426
default_area = np.zeros((max_contour_count,))
default_perimeter = np.zeros((max_contour_count,))
default_circularity = np.zeros((max_contour_count,))

In [13]:
postprocesseddata['class'] = postprocesseddata['class'].astype('int')

In [14]:
# extract feature (postprocesseddata)
def extract_area_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    area = calculate_area(postprocessed_mask)
    return area.flatten()

def extract_perimeter_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    perimeter = calculate_perimeter(postprocessed_mask)
    return perimeter.flatten()

def extract_circularity_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    circularity = calculate_circularity(postprocessed_mask)
    return circularity.flatten()

def extract_color_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path)
    color = calculate_color(postprocessed_mask)
    return color.flatten()

# apply function on the image
area_features = postprocesseddata['postprocessedmask_path'].apply(extract_area_features)
perimeter_features = postprocesseddata['postprocessedmask_path'].apply(extract_perimeter_features)
circularity_features = postprocesseddata['postprocessedmask_path'].apply(extract_circularity_features)
color_features = postprocesseddata['postprocessedmask_path'].apply(extract_color_features)

# make each number in the array separate into each column
area_features = np.vstack(area_features)
perimeter_features = np.vstack(perimeter_features)
circularity_features = np.vstack(circularity_features)
color_features = np.vstack(color_features)

# name the columns
area_columns = [f"area_{i}" for i in range(area_features.shape[1])]
perimeter_columns = [f"perimeter_{i}" for i in range(perimeter_features.shape[1])]
circularity_columns = [f"circularity_{i}" for i in range(circularity_features.shape[1])]
color_columns = [f"color_{i}" for i in range(color_features.shape[1])]

area_data=pd.DataFrame(area_features, columns=area_columns)
perimeter_data=pd.DataFrame(perimeter_features, columns=perimeter_columns)
circularity_data=pd.DataFrame(circularity_features, columns=circularity_columns)
color_data=pd.DataFrame(color_features, columns=color_columns)

# find the column which is 0 and delete the column
zero_columns = area_data.columns[(area_data == 0).all()]
area_data = area_data.loc[:, ~area_data.columns.isin(zero_columns)]

zero2_columns = perimeter_data.columns[(perimeter_data == 0).all()]
perimeter_data = perimeter_data.loc[:, ~perimeter_data.columns.isin(zero_columns)]

zero3_columns = circularity_data.columns[(circularity_data == 0).all()]
circularity_data = circularity_data.loc[:, ~circularity_data.columns.isin(zero_columns)]

zero_columns = color_data.columns[(color_data == 0).all()]
color_data = color_data.loc[:, ~color_data.columns.isin(zero_columns)]

area_data.to_csv('areargb_segnetdata.csv', index=False)
perimeter_data.to_csv('perimeterrgb_segnetdata.csv', index=False)
circularity_data.to_csv('circularityrgb_segnetdata.csv', index=False)
color_data.to_csv('colorrgb_segnetdata.csv', index=False)

In [15]:
def calculate_texture(segmented_mask):
    if segmented_mask.ndim == 3 and segmented_mask.shape[2] == 3:
        gray_image = cv2.cvtColor(segmented_mask, cv2.COLOR_RGB2GRAY)
    else:
        gray_image = segmented_mask

    if gray_image.ndim != 2:
        gray_image = gray_image[:, :, 0]

    lbp = local_binary_pattern(gray_image, 8, 1, method='uniform')
    histogram, _ = np.histogram(lbp, bins=np.arange(0, 9), range=(0, 8))
    texture = histogram / np.sum(histogram)
    return texture

In [16]:
segmenteddata['class'] = segmenteddata['class'].astype('int')

In [17]:
# extract feature (segmented data)
def extract_texture_features(segmentedmask_path):
    segmented_mask = cv2.imread(segmentedmask_path, 0)
    texture = calculate_texture(segmented_mask)
    return texture.flatten()

# apply function on the image
texture_features = segmenteddata['segmentedmask_path'].apply(extract_texture_features)

# make each number in the array separate into each column
texture_features = np.vstack(texture_features)

# name the columns
texture_columns = [f"texture_{i}" for i in range(texture_features.shape[1])]

texture_data=pd.DataFrame(texture_features, columns=texture_columns)

texture_data.to_csv('texturergb_segnetdata.csv', index=False)

## Feature Data for Classification

In [18]:
# combine all the features data into dataframe
data = pd.concat([area_data, perimeter_data, circularity_data, texture_data, color_data], axis=1)
labels = segmenteddata['class']

print()
data.head(5)




Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,color_500,color_503,color_504,color_505,color_506,color_507,color_508,color_509,color_510,color_511
0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.5,0.5,...,0.0,0.0,0.000366,7.6e-05,1.5e-05,0.0,0.0,0.0,4.6e-05,0.113174
1,0.0,0.0,46.5,2.5,64.5,23.0,0.0,113.0,0.0,0.0,...,0.0,0.0,3.1e-05,1.5e-05,0.0,0.0,0.0,0.0,0.0,0.25
2,0.0,0.0,0.0,0.0,0.0,0.0,7441.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000153,0.236664
3,0.5,0.0,0.0,20.5,0.0,82.5,0.0,4.5,1.0,0.0,...,0.0,0.0,0.000305,0.000107,0.0,0.0,1.5e-05,7.6e-05,7.6e-05,0.117691
4,6381.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108963


In [19]:
# training set, testing set and validation set
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(test_data, test_labels, test_size=0.5, random_state=42)

In [20]:
# check on training set, testing set and validation set
print("Train Data:")
train_data.head(132)

Train Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,color_500,color_503,color_504,color_505,color_506,color_507,color_508,color_509,color_510,color_511
146,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.034317,0.004257,0.000626,0.0,0.000015,0.001968,0.004822,0.196640
173,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,1.5,...,0.000000,0.0,0.001633,0.000031,0.000015,0.0,0.000000,0.000031,0.000046,0.128983
206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.001892,0.000122,0.000015,0.0,0.000000,0.000046,0.000061,0.199356
0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.5,0.5,...,0.000000,0.0,0.000366,0.000076,0.000015,0.0,0.000000,0.000000,0.000046,0.113174
2,0.0,0.0,0.0,0.0,0.0,0.0,7441.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000153,0.236664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,54.5,...,0.000000,0.0,0.039429,0.002792,0.000366,0.0,0.000122,0.001968,0.003571,0.063736
14,0.0,14563.0,0.0,18.0,33.0,0.0,0.0,0.0,14934.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000031,0.453156
92,5610.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.000015,0.0,0.027420,0.006348,0.000671,0.0,0.000214,0.005615,0.013351,0.325211
179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.000000,0.0,0.000443,0.000000,0.000000,0.0,0.000000,0.000000,0.000031,0.312332


In [21]:
print("Test Data:")
test_data.head(44)

Test Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,color_500,color_503,color_504,color_505,color_506,color_507,color_508,color_509,color_510,color_511
84,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.04686,0.001785,0.000153,0.0,4.6e-05,0.001099,0.002457,0.151352
132,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,9.0,...,0.0,0.0,0.019669,0.001877,0.000168,0.0,1.5e-05,0.000946,0.002487,0.266937
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.1e-05,0.022842,0.002899,0.000259,3.1e-05,0.000488,0.003632,0.006836,0.181442
210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00972,0.000336,4.6e-05,0.0,3.1e-05,0.000122,0.000397,0.031937
190,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001923,7.6e-05,1.5e-05,0.0,3.1e-05,3.1e-05,0.001221,0.655777
65,3570.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,1.5e-05,0.035141,0.006424,0.000732,0.0,0.000137,0.00354,0.008698,0.220932
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.073273,0.00235,0.000168,0.0,1.5e-05,0.001038,0.003021,0.095261
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,...,0.0,0.0,0.010361,0.000153,0.0,0.0,1.5e-05,3.1e-05,0.000671,0.073425
15,3834.0,2.0,3.0,0.0,0.0,0.0,2.5,0.0,13.5,12.0,...,0.0,0.0,0.000534,4.6e-05,0.0,0.0,0.0,0.0,0.0,0.121078
112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.5,0.0,0.0,...,0.0,1.5e-05,0.005371,0.001816,0.000244,0.0,3.1e-05,0.000931,0.002579,0.253738


In [22]:
print("Validation Data:")
val_data.head(44)

Validation Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,color_500,color_503,color_504,color_505,color_506,color_507,color_508,color_509,color_510,color_511
180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003601,4.6e-05,0.0,0.0,0.0,0.0,0.0,0.085876
18,6684.0,0.0,0.0,0.0,0.5,25.5,0.0,2.0,1.0,14.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203156
141,3315.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.032425,0.0047,0.000504,0.0,1.5e-05,0.001785,0.00502,0.227859
75,0.0,0.0,64689.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.024048,0.001465,0.000153,0.0,1.5e-05,0.001099,0.002823,0.23645
186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004059,0.000488,4.6e-05,0.0,0.0,0.000198,0.000519,0.198547
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000717,0.0,0.0,0.0,0.0,0.0,0.00029,0.041962
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001678,0.000153,1.5e-05,1.5e-05,1.5e-05,6.1e-05,0.000305,0.079987
82,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,64479.5,0.0,...,0.0,3.1e-05,0.037781,0.003983,0.000458,1.5e-05,0.000229,0.003525,0.007385,0.258255
150,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018936,0.003128,0.000366,0.0,0.0,0.001587,0.003586,0.349228
26,0.0,0.0,0.0,0.0,2.0,131.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000977,0.000137,0.0,0.0,0.0,6.1e-05,7.6e-05,0.0354


## Classification Modelling (SVM, Random Forest, Naive Bayes)

In [27]:
svm_classifier = SVC()
rf_classifier = RandomForestClassifier()
nb_classifier = GaussianNB()

classifiers = [svm_classifier, rf_classifier, nb_classifier]

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted')
}

# k-fold validation
# k = 10 
# results = {}
# for classifier in classifiers:
#     classifier_name = type(classifier).__name__
#     print("Running k-fold cross-validation for", classifier_name)
#     cv_results = cross_validate(classifier, train_data, train_labels, cv=k, scoring=scoring)
#     results[classifier_name] = cv_results

# for classifier_name, cv_results in results.items():
#     print("\nResults for", classifier_name)
#     print("Accuracy:", cv_results['test_accuracy'])
#     print("Precision:", cv_results['test_precision'])
#     print("Recall:", cv_results['test_recall'])
#     print("F1 Score:", cv_results['test_f1_score'])
    
k = 10
results = {}
for classifier in classifiers:
    classifier_name = type(classifier).__name__
    print("Running k-fold cross-validation for", classifier_name)
    cv_results = cross_validate(classifier, train_data, train_labels, cv=k, scoring=scoring)
    results[classifier_name] = cv_results

for classifier_name, cv_results in results.items():
    print("\nResults for", classifier_name)
    print("Accuracy:", np.mean(cv_results['test_accuracy']))
    print("Precision:", np.mean(cv_results['test_precision']))
    print("Recall:", np.mean(cv_results['test_recall']))
    print("F1 Score:", np.mean(cv_results['test_f1_score']))

Running k-fold cross-validation for SVC
Running k-fold cross-validation for RandomForestClassifier
Running k-fold cross-validation for GaussianNB

Results for SVC
Accuracy: 0.5467032967032968
Precision: 0.5751495726495727
Recall: 0.5467032967032968
F1 Score: 0.4888256554190621

Results for RandomForestClassifier
Accuracy: 0.9087912087912089
Precision: 0.9248901098901099
Recall: 0.9087912087912089
F1 Score: 0.9083267922553638

Results for GaussianNB
Accuracy: 0.3730769230769231
Precision: 0.41290293040293047
Recall: 0.3730769230769231
F1 Score: 0.32143213928928216


In [24]:
# training model
svm_classifier.fit(train_data, train_labels)
rf_classifier.fit(train_data, train_labels)
nb_classifier.fit(train_data, train_labels)

# validating model
svm_val_predictions = svm_classifier.predict(val_data)
rf_val_predictions = rf_classifier.predict(val_data)
nb_val_predictions = nb_classifier.predict(val_data)

svm_val_accuracy = accuracy_score(val_labels, svm_val_predictions)
svm_val_precision = precision_score(val_labels, svm_val_predictions, average='weighted')
svm_val_recall = recall_score(val_labels, svm_val_predictions, average='weighted')
svm_val_f1_score = f1_score(val_labels, svm_val_predictions, average='weighted')

rf_val_accuracy = accuracy_score(val_labels, rf_val_predictions)
rf_val_precision = precision_score(val_labels, rf_val_predictions, average='weighted')
rf_val_recall = recall_score(val_labels, rf_val_predictions, average='weighted')
rf_val_f1_score = f1_score(val_labels, rf_val_predictions, average='weighted')

nb_val_accuracy = accuracy_score(val_labels, nb_val_predictions)
nb_val_precision = precision_score(val_labels, nb_val_predictions, average='weighted')
nb_val_recall = recall_score(val_labels, nb_val_predictions, average='weighted')
nb_val_f1_score = f1_score(val_labels, nb_val_predictions, average='weighted')

svm_val_confusion_matrix = confusion_matrix(val_labels, svm_val_predictions)
rf_val_confusion_matrix = confusion_matrix(val_labels, rf_val_predictions)
nb_val_confusion_matrix = confusion_matrix(val_labels, nb_val_predictions)

In [25]:
# testing model
svm_test_predictions = svm_classifier.predict(test_data)
rf_test_predictions = rf_classifier.predict(test_data)
nb_test_predictions = nb_classifier.predict(test_data)

svm_test_accuracy = accuracy_score(test_labels, svm_test_predictions)
svm_test_precision = precision_score(test_labels, svm_test_predictions, average='weighted')
svm_test_recall = recall_score(test_labels, svm_test_predictions, average='weighted')
svm_test_f1_score = f1_score(test_labels, svm_test_predictions, average='weighted')

rf_test_accuracy = accuracy_score(test_labels, rf_test_predictions)
rf_test_precision = precision_score(test_labels, rf_test_predictions, average='weighted')
rf_test_recall = recall_score(test_labels, rf_test_predictions, average='weighted')
rf_test_f1_score = f1_score(test_labels, rf_test_predictions, average='weighted')

nb_test_accuracy = accuracy_score(test_labels, nb_test_predictions)
nb_test_precision = precision_score(test_labels, nb_test_predictions, average='weighted')
nb_test_recall = recall_score(test_labels, nb_test_predictions, average='weighted')
nb_test_f1_score = f1_score(test_labels, nb_test_predictions, average='weighted')

svm_test_confusion_matrix = confusion_matrix(test_labels, svm_test_predictions)
rf_test_confusion_matrix = confusion_matrix(test_labels, rf_test_predictions)
nb_test_confusion_matrix = confusion_matrix(test_labels, nb_test_predictions)

In [26]:
# calculate accuracy, precision, recall, f1 score and confusion metric
print("\nValidation Set Metrics:")
print("SVM Accuracy:", svm_val_accuracy)
print("SVM Precision:", svm_val_precision)
print("SVM Recall:", svm_val_recall)
print("SVM F1 Score:", svm_val_f1_score)
print("SVM Confusion Matrix:\n", svm_val_confusion_matrix)
print("\nRandom Forest Accuracy:", rf_val_accuracy)
print("Random Forest Precision:", rf_val_precision)
print("Random Forest Recall:", rf_val_recall)
print("Random Forest F1 Score:", rf_val_f1_score)
print("Random Forest Confusion Matrix:\n", rf_val_confusion_matrix)
print("\nNaive Bayes Accuracy:", nb_val_accuracy)
print("Naive Bayes Precision:", nb_val_precision)
print("Naive Bayes Recall:", nb_val_recall)
print("Naive Bayes F1 Score:", nb_val_f1_score)
print("Naive Bayes Confusion Matrix:\n", nb_val_confusion_matrix)


print("\nTesting Set Metrics:")
print("SVM Accuracy:", svm_test_accuracy)
print("SVM Precision:", svm_test_precision)
print("SVM Recall:", svm_test_recall)
print("SVM F1 Score:", svm_test_f1_score)
print("SVM Confusion Matrix:\n", svm_test_confusion_matrix)
print("\nRandom Forest Accuracy:", rf_test_accuracy)
print("Random Forest Precision:", rf_test_precision)
print("Random Forest Recall:", rf_test_recall)
print("Random Forest F1 Score:", rf_test_f1_score)
print("Random Forest Confusion Matrix:\n", rf_test_confusion_matrix)
print("\nNaive Bayes Accuracy:", nb_test_accuracy)
print("Naive Bayes Precision:", nb_test_precision)
print("Naive Bayes Recall:", nb_test_recall)
print("Naive Bayes F1 Score:", nb_test_f1_score)
print("Naive Bayes Confusion Matrix:\n", nb_test_confusion_matrix)


Validation Set Metrics:
SVM Accuracy: 0.5227272727272727
SVM Precision: 0.5056067669704034
SVM Recall: 0.5227272727272727
SVM F1 Score: 0.46921709599745576
SVM Confusion Matrix:
 [[11  0  0  0]
 [ 0  5  1  7]
 [ 7  2  1  0]
 [ 0  4  0  6]]

Random Forest Accuracy: 0.9090909090909091
Random Forest Precision: 0.9168388429752067
Random Forest Recall: 0.9090909090909091
Random Forest F1 Score: 0.9080617866981502
Random Forest Confusion Matrix:
 [[ 9  1  0  1]
 [ 0 11  2  0]
 [ 0  0 10  0]
 [ 0  0  0 10]]

Naive Bayes Accuracy: 0.4090909090909091
Naive Bayes Precision: 0.5260416666666666
Naive Bayes Recall: 0.4090909090909091
Naive Bayes F1 Score: 0.3474920064261757
Naive Bayes Confusion Matrix:
 [[9 1 1 0]
 [6 7 0 0]
 [6 3 1 0]
 [3 5 1 1]]

Testing Set Metrics:
SVM Accuracy: 0.5454545454545454
SVM Precision: 0.6934400826446281
SVM Recall: 0.5454545454545454
SVM F1 Score: 0.5075757575757576
SVM Confusion Matrix:
 [[9 0 0 0]
 [1 2 1 8]
 [2 0 5 6]
 [0 0 2 8]]

Random Forest Accuracy: 0.93181