## Grayscale Image UNet Segmentation

In [30]:
import os
import cv2
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from PIL import Image, ImageOps
from tensorflow.keras.models import load_model
from custom_functions import dice_loss, dice_coefficient, iou

In [31]:
folder_path = r'C:\Users\BoonJane\Desktop\oralcancer\Grayscaleimage' 
class_names = ['Normal', 'Hyperchromasia', 'Atypical mitotic figure', 'Abnormal variation in nuclear shape'] 

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
data = pd.DataFrame(columns=['image_path', 'label'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        data = data.append({'image_path': file_path, 'label': class_index}, ignore_index=True)

In [5]:
print(data.head(220)) 

                                            image_path label
0    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
1    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
2    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
3    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
4    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
..                                                 ...   ...
215  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
216  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
217  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
218  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
219  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3

[220 rows x 2 columns]


In [6]:
data.to_csv('datagray.csv', index=False)

In [7]:
# Open an image and a mask
image_path = r"C:\Users\BoonJane\Desktop\oralcancer\Grayscaleimage\Abnormal variation in nuclear shape\image_165.jpg"

image = Image.open(image_path)

# Get the size of the image and mask using the size attribute
image_size = image.size

print("Image size:", image_size)

# Convert the images to NumPy arrays
image_array = np.array(image)

# Get the shape of the arrays using the shape attribute
image_shape = image_array.shape

print("Image shape:", image_shape)


Image size: (256, 256)
Image shape: (256, 256)


In [8]:
custom_objects = {
    'dice_loss': dice_loss,
    'dice_coefficient': dice_coefficient,
    'iou': iou
}

model = load_model('unetgray.h5', custom_objects=custom_objects)

In [9]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 256, 256, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d_23 (Conv2D)             (None, 256, 256, 64  640         ['input_2[0][0]']                
                                )                                                                 
                                                                                                  
 conv2d_24 (Conv2D)             (None, 256, 256, 64  36928       ['conv2d_23[0][0]']              
                                )                                                           

                                                                                                  
 up_sampling2d_7 (UpSampling2D)  (None, 256, 256, 12  0          ['conv2d_41[0][0]']              
                                8)                                                                
                                                                                                  
 conv2d_42 (Conv2D)             (None, 256, 256, 64  32832       ['up_sampling2d_7[0][0]']        
                                )                                                                 
                                                                                                  
 concatenate_7 (Concatenate)    (None, 256, 256, 12  0           ['conv2d_24[0][0]',              
                                8)                                'conv2d_42[0][0]']              
                                                                                                  
 conv2d_43

In [10]:
model.input

<KerasTensor: shape=(None, 256, 256, 1) dtype=float32 (created by layer 'input_2')>

In [11]:
model.output

<KerasTensor: shape=(None, 256, 256, 1) dtype=float32 (created by layer 'conv2d_45')>

In [12]:
data = pd.read_csv('datagray.csv')

In [13]:
def preprocess_image(image):
    # resize (make sure input image is 256*256)
    resized_image = cv2.resize(image, (256, 256))

    # grayscale
    gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

    # normalized
    normalized_image = gray_image / 255.0

    # adjust image shape to (1, 256, 256, 1)
    processed_image = np.expand_dims(normalized_image, axis=0)  
    processed_image = np.expand_dims(processed_image, axis=-1)  

    return processed_image

In [14]:
def postprocess_image(segmented_image, threshold=0.5):
    # convert from 0 and 1 to 0 and 255 (able to see the mask)
    binary_image = np.where(segmented_image >= threshold, 0, 255).astype(np.uint8)
    return [binary_image]

In [15]:
output_folder = r"C:\Users\BoonJane\Desktop\oralcancer\unetgray\segmented_mask"
output_folder2 = r"C:\Users\BoonJane\Desktop\oralcancer\unetgray\postprocessed_mask"
os.makedirs(output_folder, exist_ok=True)
os.makedirs(output_folder2, exist_ok=True)

In [16]:
class_mapping = {
    0: 'Normal',
    1: 'Hyperchromasia',
    2: 'Atypical mitotic figure',
    3: 'Abnormal variation in nuclear shape'
}

In [17]:
for index, row in data.iterrows():
    image_path = row['image_path']
    label = row['label']

    image = cv2.imread(image_path)

    preprocessed_image = preprocess_image(image)
    print('preprocessed image shape:', preprocessed_image.shape)

    segmented_mask = model.predict(preprocessed_image) 
    print(segmented_mask.shape)

    postprocessed_mask = postprocess_image(segmented_mask, threshold=0.7)
    
    # save the segmented image to local folder
    for i, mask in enumerate(segmented_mask):
        class_name = class_mapping[label]  
        label_folder = os.path.join(output_folder, class_name)
        os.makedirs(label_folder, exist_ok=True)

        resized_mask = cv2.resize(mask.reshape((256, 256)), (256, 256))
        rgb_image = cv2.cvtColor(resized_mask, cv2.COLOR_GRAY2RGB)
        gray_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)

        print(gray_image.shape)

        save_path = os.path.join(label_folder, f'{class_name}_{index}_{i}.jpg')
        cv2.imwrite(save_path, gray_image)
        
    # save the postprocessed mask to local folder
    for i, mask in enumerate(postprocessed_mask):
        class_name = class_mapping[label]  
        label_folder = os.path.join(output_folder2, class_name)
        os.makedirs(label_folder, exist_ok=True)

        resized_mask = cv2.resize(mask.reshape((256, 256)), (256, 256))
        rgb_image = cv2.cvtColor(resized_mask, cv2.COLOR_GRAY2RGB)
        gray_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)

        print(gray_image.shape)

        save_path = os.path.join(label_folder, f'{class_name}_{index}_{i}.jpg')
        cv2.imwrite(save_path, gray_image)

preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed ima

(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)

(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)

## Data Preparation for Classification

In [27]:
import os
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import warnings
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from keras.models import load_model
from skimage.feature import local_binary_pattern
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

In [28]:
folder_path = r"C:\Users\BoonJane\Desktop\oralcancer\unetgray\segmented_mask" 
folder2_path = r"C:\Users\BoonJane\Desktop\oralcancer\unetgray\postprocessed_mask" 
class_names = ['Normal', 'Hyperchromasia', 'Atypical mitotic figure', 'Abnormal variation in nuclear shape'] 

In [29]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [30]:
segmenteddata = pd.DataFrame(columns=['segmentedmask_path', 'class'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        segmenteddata = segmenteddata.append({'segmentedmask_path': file_path, 'class': class_index}, ignore_index=True)

In [31]:
postprocesseddata = pd.DataFrame(columns=['postprocessedmask_path', 'class'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder2_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        postprocesseddata = postprocesseddata.append({'postprocessedmask_path': file_path, 'class': class_index}, ignore_index=True)

In [32]:
segmenteddata.head(220)

Unnamed: 0,segmentedmask_path,class
0,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
1,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
2,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
3,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
4,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
...,...,...
215,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3
216,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3
217,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3
218,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3


In [33]:
postprocesseddata.head(220)

Unnamed: 0,postprocessedmask_path,class
0,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
1,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
2,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
3,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
4,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,0
...,...,...
215,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3
216,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3
217,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3
218,C:\Users\BoonJane\Desktop\oralcancer\unetgray\...,3


## Feature Extraction

In [34]:
def calculate_area(postprocessed_mask):
    contours, _ = cv2.findContours(postprocessed_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    areas = np.array([cv2.contourArea(contour) for contour in contours])
    if len(areas) < max_contour_count:
        areas = np.concatenate([areas, default_area[len(areas):]])
    return areas

In [35]:
def calculate_perimeter(postprocessed_mask):
    contours, _ = cv2.findContours(postprocessed_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    perimeters = np.array([cv2.arcLength(contour, closed=True) for contour in contours])
    if len(perimeters) < max_contour_count:
        perimeters = np.concatenate([perimeters, default_perimeter[len(perimeters):]])
    return perimeters

In [36]:
def calculate_circularity(postprocessed_mask):
    perimeters = calculate_perimeter(postprocessed_mask)
    areas = calculate_area(postprocessed_mask)
    circularities = []
    for perimeter, area in zip(perimeters, areas):
        if perimeter == 0:
            circularity = 0
        else:
            circularity = (4 * np.pi * area) / (perimeter ** 2)
        circularities.append(circularity)
    circularities = np.array(circularities)
    if len(circularities) < max_contour_count:
        circularities = np.concatenate([circularities, default_circularity[len(circularities):]])
    return circularities

In [37]:
def calculate_color(postprocessed_mask):
    if postprocessed_mask.ndim == 3 and postprocessed_mask.shape[2] == 3:
        rgb_image = postprocessed_mask
    else:
        rgb_image = cv2.cvtColor(postprocessed_mask, cv2.COLOR_GRAY2RGB)
        
    pixels = rgb_image.reshape(-1, 3)  
    histogram, _ = np.histogramdd(pixels, bins=(8, 8, 8), range=[(0, 255), (0, 255), (0, 255)])
    color_feature = histogram.flatten() / np.sum(histogram)  
    
    return color_feature

In [38]:
max_contour_count = 372
default_area = np.zeros((max_contour_count,))
default_perimeter = np.zeros((max_contour_count,))
default_circularity = np.zeros((max_contour_count,))

In [39]:
postprocesseddata['class'] = postprocesseddata['class'].astype('int')

In [40]:
# extract feature (postprocesseddata)
def extract_area_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    area = calculate_area(postprocessed_mask)
    return area.flatten()

def extract_perimeter_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    perimeter = calculate_perimeter(postprocessed_mask)
    return perimeter.flatten()

def extract_circularity_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    circularity = calculate_circularity(postprocessed_mask)
    return circularity.flatten()

def extract_color_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path)
    color = calculate_color(postprocessed_mask)
    return color.flatten()

# apply function on the image
area_features = postprocesseddata['postprocessedmask_path'].apply(extract_area_features)
perimeter_features = postprocesseddata['postprocessedmask_path'].apply(extract_perimeter_features)
circularity_features = postprocesseddata['postprocessedmask_path'].apply(extract_circularity_features)
color_features = postprocesseddata['postprocessedmask_path'].apply(extract_color_features)

# make each number in the array separate into each column
area_features = np.vstack(area_features)
perimeter_features = np.vstack(perimeter_features)
circularity_features = np.vstack(circularity_features)
color_features = np.vstack(color_features)

# name the columns
area_columns = [f"area_{i}" for i in range(area_features.shape[1])]
perimeter_columns = [f"perimeter_{i}" for i in range(perimeter_features.shape[1])]
circularity_columns = [f"circularity_{i}" for i in range(circularity_features.shape[1])]
color_columns = [f"color_{i}" for i in range(color_features.shape[1])]

area_data=pd.DataFrame(area_features, columns=area_columns)
perimeter_data=pd.DataFrame(perimeter_features, columns=perimeter_columns)
circularity_data=pd.DataFrame(circularity_features, columns=circularity_columns)
color_data=pd.DataFrame(color_features, columns=color_columns)

# find the column which is 0 and delete the column
zero_columns = area_data.columns[(area_data == 0).all()]
area_data = area_data.loc[:, ~area_data.columns.isin(zero_columns)]

zero2_columns = perimeter_data.columns[(perimeter_data == 0).all()]
perimeter_data = perimeter_data.loc[:, ~perimeter_data.columns.isin(zero_columns)]

zero3_columns = circularity_data.columns[(circularity_data == 0).all()]
circularity_data = circularity_data.loc[:, ~circularity_data.columns.isin(zero_columns)]

zero_columns = color_data.columns[(color_data == 0).all()]
color_data = color_data.loc[:, ~color_data.columns.isin(zero_columns)]

area_data.to_csv('areagray_unetdata.csv', index=False)
perimeter_data.to_csv('perimetergray_unetdata.csv', index=False)
circularity_data.to_csv('circularitygray_unetdata.csv', index=False)
color_data.to_csv('colorgray_unetdata.csv', index=False)

In [41]:
def calculate_texture(segmented_mask):
    if segmented_mask.ndim == 3 and segmented_mask.shape[2] == 3:
        gray_image = cv2.cvtColor(segmented_mask, cv2.COLOR_RGB2GRAY)
    else:
        gray_image = segmented_mask

    if gray_image.ndim != 2:
        gray_image = gray_image[:, :, 0]

    lbp = local_binary_pattern(gray_image, 8, 1, method='uniform')
    histogram, _ = np.histogram(lbp, bins=np.arange(0, 9), range=(0, 8))
    texture = histogram / np.sum(histogram)
    return texture

In [42]:
segmenteddata['class'] = segmenteddata['class'].astype('int')

In [43]:
# extract feature (segmented data)
def extract_texture_features(segmentedmask_path):
    segmented_mask = cv2.imread(segmentedmask_path, 0)
    texture = calculate_texture(segmented_mask)
    return texture.flatten()

# apply function on the image
texture_features = segmenteddata['segmentedmask_path'].apply(extract_texture_features)

# make each number in the array separate into each column
texture_features = np.vstack(texture_features)

# name the columns
texture_columns = [f"texture_{i}" for i in range(texture_features.shape[1])]

texture_data=pd.DataFrame(texture_features, columns=texture_columns)

texture_data.to_csv('texturegray_unetdata.csv', index=False)

## Feature Data for Classification

In [44]:
# combine all the features data into dataframe
data = pd.concat([area_data, perimeter_data, circularity_data, texture_data, color_data], axis=1)
labels = segmenteddata['class']

print()
data.head(5)




Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00029,0.000871,0.0,0.006206,1.5e-05,0.033646,0.000199,0.958772,0.849564,0.150436
1,2.5,4.5,0.0,0.0,0.5,0.5,0.5,0.0,0.0,0.0,...,4.6e-05,9.2e-05,0.0,0.00061,0.0,0.016467,0.0,0.982785,0.745926,0.254074
2,0.0,0.0,0.0,3.0,0.0,0.0,7491.0,0.0,0.0,0.5,...,6.1e-05,3.1e-05,0.0,0.000427,0.0,0.015412,0.0,0.984069,0.761017,0.238983
3,0.0,1.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.000183,3.1e-05,0.0,0.002442,0.0,0.024425,0.0,0.972919,0.868271,0.131729
4,0.0,0.0,1.0,2.0,0.0,0.5,0.0,0.5,7.0,0.0,...,0.0,1.5e-05,0.0,0.000153,0.0,0.014649,0.0,0.985183,0.890778,0.109222


In [45]:
# training set, testing set and validation set
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(test_data, test_labels, test_size=0.5, random_state=42)

In [46]:
# check on training set, testing set and validation set
print("Train Data:")
train_data.head(132)

Train Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000183,0.000473,0.000000,0.007114,0.000000,0.030364,0.000000,0.961866,0.557968,0.442032
173,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.001775,0.005447,0.000031,0.036881,0.000509,0.088823,0.000664,0.865870,0.584503,0.415497
206,0.0,0.0,0.0,0.0,2.0,0.5,0.0,0.0,0.0,0.0,...,0.001124,0.004250,0.000046,0.027577,0.000354,0.071430,0.000246,0.894973,0.512466,0.487534
0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000290,0.000871,0.000000,0.006206,0.000015,0.033646,0.000199,0.958772,0.849564,0.150436
2,0.0,0.0,0.0,3.0,0.0,0.0,7491.0,0.0,0.0,0.5,...,0.000061,0.000031,0.000000,0.000427,0.000000,0.015412,0.000000,0.984069,0.761017,0.238983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.000890,0.002531,0.000000,0.023825,0.000138,0.071951,0.000123,0.900542,0.645370,0.354630
14,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,0.0,12.5,...,0.000000,0.000092,0.000000,0.000488,0.000000,0.014817,0.000000,0.984603,0.541245,0.458755
92,5610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000939,0.003185,0.000015,0.025143,0.000200,0.058102,0.000231,0.912185,0.341843,0.658157
179,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.000906,0.002578,0.000015,0.020075,0.000230,0.052872,0.000384,0.922940,0.525391,0.474609


In [47]:
print("Test Data:")
test_data.head(44)

Test Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
84,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,...,0.001184,0.003367,1.5e-05,0.029273,0.0002,0.06625,0.000384,0.899327,0.372253,0.627747
132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000352,0.0013,0.0,0.015156,4.6e-05,0.038372,9.2e-05,0.944683,0.369965,0.630035
97,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.000703,0.001177,0.0,0.020238,6.1e-05,0.05388,3.1e-05,0.923911,0.654907,0.345093
210,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.5,0.0,...,0.00075,0.00176,1.5e-05,0.019363,0.000122,0.049669,0.000138,0.928182,0.67424,0.32576
190,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,...,0.000537,0.002592,3.1e-05,0.015673,0.000107,0.039428,0.00023,0.941403,0.180328,0.819672
65,3570.0,0.0,1.0,0.5,0.0,0.0,0.0,0.5,1.0,0.0,...,0.000723,0.002354,0.0,0.027196,9.2e-05,0.049439,0.000138,0.920057,0.346649,0.653351
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000597,0.001822,0.0,0.022528,3.1e-05,0.059851,0.000184,0.914987,0.391266,0.608734
197,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0,...,0.00149,0.004239,0.0,0.036061,0.0002,0.079571,0.000415,0.878026,0.621277,0.378723
15,0.0,0.0,3879.5,0.0,0.0,0.0,0.0,13.5,0.0,0.0,...,1.5e-05,0.0,0.0,0.000336,0.0,0.017059,0.0,0.98259,0.873489,0.126511
112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.00026,0.000902,0.0,0.009339,1.5e-05,0.033689,7.6e-05,0.955718,0.597961,0.402039


In [48]:
print("Validation Data:")
val_data.head(44)

Validation Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
180,0.0,0.0,0.0,2.0,0.5,11.5,0.0,0.0,0.5,0.0,...,0.00123,0.003981,1.5e-05,0.03099,0.000354,0.082777,0.00043,0.880223,0.585312,0.414688
18,0.0,0.0,0.0,0.5,0.0,0.0,0.0,1.5,0.0,0.0,...,0.0,0.0,0.0,0.00029,0.0,0.015412,0.0,0.984298,0.794052,0.205948
141,3315.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.000168,4.6e-05,0.0,0.00531,0.0,0.024813,0.0,0.969663,0.622818,0.377182
75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.000489,0.000825,0.0,0.014089,3.1e-05,0.044025,0.000107,0.940434,0.58194,0.41806
186,0.0,0.0,0.5,0.0,0.0,0.0,11.0,0.0,0.0,0.5,...,0.000998,0.003548,0.0,0.022826,0.000169,0.063088,0.000261,0.909109,0.54039,0.45961
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0014,0.004877,6.2e-05,0.037908,0.000415,0.080677,0.000492,0.874169,0.602341,0.397659
19,0.0,4.0,11.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.001043,0.002056,1.5e-05,0.017769,0.000153,0.075158,0.000583,0.903221,0.822144,0.177856
82,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.001211,0.002499,0.0,0.028332,0.000199,0.074096,0.000291,0.893371,0.453979,0.546021
150,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,...,0.000137,0.00026,0.0,0.005205,6.1e-05,0.024057,3.1e-05,0.970249,0.526932,0.473068
26,0.5,0.0,0.0,11.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000489,0.00055,0.0,0.006998,7.6e-05,0.039159,9.2e-05,0.952636,0.914627,0.085373


## Classification Modelling (SVM, Random Forest, Naive Bayes)

In [49]:
svm_classifier = SVC()
rf_classifier = RandomForestClassifier()
nb_classifier = GaussianNB()

classifiers = [svm_classifier, rf_classifier, nb_classifier]

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted')
}
# k-fold validation
# k = 10 
# results = {}
# for classifier in classifiers:
#     classifier_name = type(classifier).__name__
#     print("Running k-fold cross-validation for", classifier_name)
#     cv_results = cross_validate(classifier, train_data, train_labels, cv=k, scoring=scoring)
#     results[classifier_name] = cv_results

# for classifier_name, cv_results in results.items():
#     print("\nResults for", classifier_name)
#     print("Accuracy:", cv_results['test_accuracy'])
#     print("Precision:", cv_results['test_precision'])
#     print("Recall:", cv_results['test_recall'])
#     print("F1 Score:", cv_results['test_f1_score'])
    
k = 10
results = {}
for classifier in classifiers:
    classifier_name = type(classifier).__name__
    print("Running k-fold cross-validation for", classifier_name)
    cv_results = cross_validate(classifier, train_data, train_labels, cv=k, scoring=scoring)
    results[classifier_name] = cv_results

for classifier_name, cv_results in results.items():
    print("\nResults for", classifier_name)
    print("Accuracy:", np.mean(cv_results['test_accuracy']))
    print("Precision:", np.mean(cv_results['test_precision']))
    print("Recall:", np.mean(cv_results['test_recall']))
    print("F1 Score:", np.mean(cv_results['test_f1_score']))

Running k-fold cross-validation for SVC
Running k-fold cross-validation for RandomForestClassifier
Running k-fold cross-validation for GaussianNB

Results for SVC
Accuracy: 0.5373626373626373
Precision: 0.6610426478283621
Recall: 0.5373626373626373
F1 Score: 0.4793095793095793

Results for RandomForestClassifier
Accuracy: 0.7489010989010989
Precision: 0.7711630036630036
Recall: 0.7489010989010989
F1 Score: 0.7371916971916972

Results for GaussianNB
Accuracy: 0.33241758241758246
Precision: 0.35457352171637885
Recall: 0.33241758241758246
F1 Score: 0.2777968368627709


In [50]:
# training model
svm_classifier.fit(train_data, train_labels)
rf_classifier.fit(train_data, train_labels)
nb_classifier.fit(train_data, train_labels)

# validating model
svm_val_predictions = svm_classifier.predict(val_data)
rf_val_predictions = rf_classifier.predict(val_data)
nb_val_predictions = nb_classifier.predict(val_data)

svm_val_accuracy = accuracy_score(val_labels, svm_val_predictions)
svm_val_precision = precision_score(val_labels, svm_val_predictions, average='weighted')
svm_val_recall = recall_score(val_labels, svm_val_predictions, average='weighted')
svm_val_f1_score = f1_score(val_labels, svm_val_predictions, average='weighted')

rf_val_accuracy = accuracy_score(val_labels, rf_val_predictions)
rf_val_precision = precision_score(val_labels, rf_val_predictions, average='weighted')
rf_val_recall = recall_score(val_labels, rf_val_predictions, average='weighted')
rf_val_f1_score = f1_score(val_labels, rf_val_predictions, average='weighted')

nb_val_accuracy = accuracy_score(val_labels, nb_val_predictions)
nb_val_precision = precision_score(val_labels, nb_val_predictions, average='weighted')
nb_val_recall = recall_score(val_labels, nb_val_predictions, average='weighted')
nb_val_f1_score = f1_score(val_labels, nb_val_predictions, average='weighted')

svm_val_confusion_matrix = confusion_matrix(val_labels, svm_val_predictions)
rf_val_confusion_matrix = confusion_matrix(val_labels, rf_val_predictions)
nb_val_confusion_matrix = confusion_matrix(val_labels, nb_val_predictions)

In [51]:
# testing model
svm_test_predictions = svm_classifier.predict(test_data)
rf_test_predictions = rf_classifier.predict(test_data)
nb_test_predictions = nb_classifier.predict(test_data)

svm_test_accuracy = accuracy_score(test_labels, svm_test_predictions)
svm_test_precision = precision_score(test_labels, svm_test_predictions, average='weighted')
svm_test_recall = recall_score(test_labels, svm_test_predictions, average='weighted')
svm_test_f1_score = f1_score(test_labels, svm_test_predictions, average='weighted')

rf_test_accuracy = accuracy_score(test_labels, rf_test_predictions)
rf_test_precision = precision_score(test_labels, rf_test_predictions, average='weighted')
rf_test_recall = recall_score(test_labels, rf_test_predictions, average='weighted')
rf_test_f1_score = f1_score(test_labels, rf_test_predictions, average='weighted')

nb_test_accuracy = accuracy_score(test_labels, nb_test_predictions)
nb_test_precision = precision_score(test_labels, nb_test_predictions, average='weighted')
nb_test_recall = recall_score(test_labels, nb_test_predictions, average='weighted')
nb_test_f1_score = f1_score(test_labels, nb_test_predictions, average='weighted')

svm_test_confusion_matrix = confusion_matrix(test_labels, svm_test_predictions)
rf_test_confusion_matrix = confusion_matrix(test_labels, rf_test_predictions)
nb_test_confusion_matrix = confusion_matrix(test_labels, nb_test_predictions)

In [52]:
# calculate accuracy, precision, recall, f1 score and confusion metric
print("\nValidation Set Metrics:")
print("SVM Accuracy:", svm_val_accuracy)
print("SVM Precision:", svm_val_precision)
print("SVM Recall:", svm_val_recall)
print("SVM F1 Score:", svm_val_f1_score)
print("SVM Confusion Matrix:\n", svm_val_confusion_matrix)
print("\nRandom Forest Accuracy:", rf_val_accuracy)
print("Random Forest Precision:", rf_val_precision)
print("Random Forest Recall:", rf_val_recall)
print("Random Forest F1 Score:", rf_val_f1_score)
print("Random Forest Confusion Matrix:\n", rf_val_confusion_matrix)
print("\nNaive Bayes Accuracy:", nb_val_accuracy)
print("Naive Bayes Precision:", nb_val_precision)
print("Naive Bayes Recall:", nb_val_recall)
print("Naive Bayes F1 Score:", nb_val_f1_score)
print("Naive Bayes Confusion Matrix:\n", nb_val_confusion_matrix)


print("\nTesting Set Metrics:")
print("SVM Accuracy:", svm_test_accuracy)
print("SVM Precision:", svm_test_precision)
print("SVM Recall:", svm_test_recall)
print("SVM F1 Score:", svm_test_f1_score)
print("SVM Confusion Matrix:\n", svm_test_confusion_matrix)
print("\nRandom Forest Accuracy:", rf_test_accuracy)
print("Random Forest Precision:", rf_test_precision)
print("Random Forest Recall:", rf_test_recall)
print("Random Forest F1 Score:", rf_test_f1_score)
print("Random Forest Confusion Matrix:\n", rf_test_confusion_matrix)
print("\nNaive Bayes Accuracy:", nb_test_accuracy)
print("Naive Bayes Precision:", nb_test_precision)
print("Naive Bayes Recall:", nb_test_recall)
print("Naive Bayes F1 Score:", nb_test_f1_score)
print("Naive Bayes Confusion Matrix:\n", nb_test_confusion_matrix)


Validation Set Metrics:
SVM Accuracy: 0.4090909090909091
SVM Precision: 0.4148098633392751
SVM Recall: 0.4090909090909091
SVM F1 Score: 0.3672076058645931
SVM Confusion Matrix:
 [[8 0 1 2]
 [1 3 1 8]
 [8 0 1 1]
 [1 3 0 6]]

Random Forest Accuracy: 0.7045454545454546
Random Forest Precision: 0.7089236521054704
Random Forest Recall: 0.7045454545454546
Random Forest F1 Score: 0.7023285486443384
Random Forest Confusion Matrix:
 [[9 0 1 1]
 [2 8 1 2]
 [2 0 8 0]
 [1 3 0 6]]

Naive Bayes Accuracy: 0.29545454545454547
Naive Bayes Precision: 0.28601807549175967
Naive Bayes Recall: 0.29545454545454547
Naive Bayes F1 Score: 0.27920835145715533
Naive Bayes Confusion Matrix:
 [[6 0 3 2]
 [4 4 1 4]
 [5 1 3 1]
 [4 4 2 0]]

Testing Set Metrics:
SVM Accuracy: 0.36363636363636365
SVM Precision: 0.47251082251082244
SVM Recall: 0.36363636363636365
SVM F1 Score: 0.3112292879534259
SVM Confusion Matrix:
 [[8 0 1 0]
 [2 2 1 7]
 [9 0 1 3]
 [1 0 4 5]]

Random Forest Accuracy: 0.6363636363636364
Random Forest 