## Grayscale Image SegNet Segmentation

In [1]:
import os
import cv2
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from PIL import Image, ImageOps
from tensorflow.keras.models import load_model
from custom_functions import dice_loss, dice_coefficient, iou

In [2]:
folder_path = r'C:\Users\BoonJane\Desktop\oralcancer\Grayscaleimage' 
class_names = ['Normal', 'Hyperchromasia', 'Atypical mitotic figure', 'Abnormal variation in nuclear shape'] 

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
data = pd.DataFrame(columns=['image_path', 'label'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        data = data.append({'image_path': file_path, 'label': class_index}, ignore_index=True)

In [5]:
print(data.head(220)) 

                                            image_path label
0    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
1    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
2    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
3    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
4    C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     0
..                                                 ...   ...
215  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
216  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
217  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
218  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3
219  C:\Users\BoonJane\Desktop\oralcancer\Grayscale...     3

[220 rows x 2 columns]


In [6]:
data.to_csv('datagray.csv', index=False)

In [7]:
# Open an image and a mask
image_path = r"C:\Users\BoonJane\Desktop\oralcancer\Grayscaleimage\Abnormal variation in nuclear shape\image_165.jpg"

image = Image.open(image_path)

# Get the size of the image and mask using the size attribute
image_size = image.size

print("Image size:", image_size)

# Convert the images to NumPy arrays
image_array = np.array(image)

# Get the shape of the arrays using the shape attribute
image_shape = image_array.shape

print("Image shape:", image_shape)


Image size: (256, 256)
Image shape: (256, 256)


In [8]:
custom_objects = {
    'dice_loss': dice_loss,
    'dice_coefficient': dice_coefficient,
    'iou': iou
}

model = load_model('segnetgray.h5', custom_objects=custom_objects)

In [9]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 256, 256, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 256, 256, 64)      640       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 128, 128, 64)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 128, 128, 128)     73856     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 64, 64, 128)      0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 64, 64, 256)       295168

In [10]:
model.input

<KerasTensor: shape=(None, 256, 256, 1) dtype=float32 (created by layer 'input_1')>

In [11]:
model.output

<KerasTensor: shape=(None, 256, 256, 1) dtype=float32 (created by layer 'conv2d_6')>

In [12]:
data = pd.read_csv('datagray.csv')

In [13]:
def preprocess_image(image):
    # resize (make sure input image is 256*256)
    resized_image = cv2.resize(image, (256, 256))

    # grayscale
    gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

    # normalized
    normalized_image = gray_image / 255.0

    # adjust image shape to (1, 256, 256, 1)
    processed_image = np.expand_dims(normalized_image, axis=0)  
    processed_image = np.expand_dims(processed_image, axis=-1)  

    return processed_image

In [14]:
def postprocess_image(segmented_image, threshold=0.5):
    # convert from 0 and 1 to 0 and 255 (able to see the mask)
    binary_image = np.where(segmented_image >= threshold, 0, 255).astype(np.uint8)
    return [binary_image]

In [15]:
output_folder = r"C:\Users\BoonJane\Desktop\oralcancer\segnetgray\segmented_mask"
output_folder2 = r"C:\Users\BoonJane\Desktop\oralcancer\segnetgray\postprocessed_mask"
os.makedirs(output_folder, exist_ok=True)
os.makedirs(output_folder2, exist_ok=True)

In [16]:
class_mapping = {
    0: 'Normal',
    1: 'Hyperchromasia',
    2: 'Atypical mitotic figure',
    3: 'Abnormal variation in nuclear shape'
}

In [17]:
for index, row in data.iterrows():
    image_path = row['image_path']
    label = row['label']

    image = cv2.imread(image_path)

    preprocessed_image = preprocess_image(image)
    print('preprocessed image shape:', preprocessed_image.shape)

    segmented_mask = model.predict(preprocessed_image) 
    print(segmented_mask.shape)

    postprocessed_mask = postprocess_image(segmented_mask, threshold=0.7)
    
    # save the segmented image to local folder
    for i, mask in enumerate(segmented_mask):
        class_name = class_mapping[label]  
        label_folder = os.path.join(output_folder, class_name)
        os.makedirs(label_folder, exist_ok=True)

        resized_mask = cv2.resize(mask.reshape((256, 256)), (256, 256))
        rgb_image = cv2.cvtColor(resized_mask, cv2.COLOR_GRAY2RGB)
        gray_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)

        print(gray_image.shape)

        save_path = os.path.join(label_folder, f'{class_name}_{index}_{i}.jpg')
        cv2.imwrite(save_path, gray_image)
        
    # save the postprocessed mask to local folder
    for i, mask in enumerate(postprocessed_mask):
        class_name = class_mapping[label]  
        label_folder = os.path.join(output_folder2, class_name)
        os.makedirs(label_folder, exist_ok=True)

        resized_mask = cv2.resize(mask.reshape((256, 256)), (256, 256))
        rgb_image = cv2.cvtColor(resized_mask, cv2.COLOR_GRAY2RGB)
        gray_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2GRAY)

        print(gray_image.shape)

        save_path = os.path.join(label_folder, f'{class_name}_{index}_{i}.jpg')
        cv2.imwrite(save_path, gray_image)

preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed ima

(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)

(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)
(256, 256)
(256, 256)
preprocessed image shape: (1, 256, 256, 1)
(1, 256, 256, 1)

## Data Preparation for Classification

In [1]:
import os
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import warnings
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from keras.models import load_model
from skimage.feature import local_binary_pattern
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

In [2]:
folder_path = r"C:\Users\BoonJane\Desktop\oralcancer\segnetgray\segmented_mask" 
folder2_path = r"C:\Users\BoonJane\Desktop\oralcancer\segnetgray\postprocessed_mask" 
class_names = ['Normal', 'Hyperchromasia', 'Atypical mitotic figure', 'Abnormal variation in nuclear shape'] 

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
segmenteddata = pd.DataFrame(columns=['segmentedmask_path', 'class'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        segmenteddata = segmenteddata.append({'segmentedmask_path': file_path, 'class': class_index}, ignore_index=True)

In [5]:
postprocesseddata = pd.DataFrame(columns=['postprocessedmask_path', 'class'])

for class_index, class_name in enumerate(class_names):
    class_folder = os.path.join(folder2_path, class_name)
    files = os.listdir(class_folder)

    # run all files and add it into dataframe
    for file_name in files:
        file_path = os.path.join(class_folder, file_name)
        postprocesseddata = postprocesseddata.append({'postprocessedmask_path': file_path, 'class': class_index}, ignore_index=True)

In [6]:
segmenteddata.head(220)

Unnamed: 0,segmentedmask_path,class
0,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
1,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
2,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
3,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
4,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
...,...,...
215,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3
216,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3
217,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3
218,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3


In [7]:
postprocesseddata.head(220)

Unnamed: 0,postprocessedmask_path,class
0,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
1,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
2,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
3,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
4,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,0
...,...,...
215,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3
216,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3
217,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3
218,C:\Users\BoonJane\Desktop\oralcancer\segnetgra...,3


## Feature Extraction

In [8]:
def calculate_area(postprocessed_mask):
    contours, _ = cv2.findContours(postprocessed_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    areas = np.array([cv2.contourArea(contour) for contour in contours])
    if len(areas) < max_contour_count:
        areas = np.concatenate([areas, default_area[len(areas):]])
    return areas

In [9]:
def calculate_perimeter(postprocessed_mask):
    contours, _ = cv2.findContours(postprocessed_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    perimeters = np.array([cv2.arcLength(contour, closed=True) for contour in contours])
    if len(perimeters) < max_contour_count:
        perimeters = np.concatenate([perimeters, default_perimeter[len(perimeters):]])
    return perimeters

In [10]:
def calculate_circularity(postprocessed_mask):
    perimeters = calculate_perimeter(postprocessed_mask)
    areas = calculate_area(postprocessed_mask)
    circularities = []
    for perimeter, area in zip(perimeters, areas):
        if perimeter == 0:
            circularity = 0
        else:
            circularity = (4 * np.pi * area) / (perimeter ** 2)
        circularities.append(circularity)
    circularities = np.array(circularities)
    if len(circularities) < max_contour_count:
        circularities = np.concatenate([circularities, default_circularity[len(circularities):]])
    return circularities

In [11]:
def calculate_color(postprocessed_mask):
    if postprocessed_mask.ndim == 3 and postprocessed_mask.shape[2] == 3:
        rgb_image = postprocessed_mask
    else:
        rgb_image = cv2.cvtColor(postprocessed_mask, cv2.COLOR_GRAY2RGB)
        
    pixels = rgb_image.reshape(-1, 3)  
    histogram, _ = np.histogramdd(pixels, bins=(8, 8, 8), range=[(0, 255), (0, 255), (0, 255)])
    color_feature = histogram.flatten() / np.sum(histogram)  
    
    return color_feature

In [12]:
max_contour_count = 380
default_area = np.zeros((max_contour_count,))
default_perimeter = np.zeros((max_contour_count,))
default_circularity = np.zeros((max_contour_count,))

In [13]:
postprocesseddata['class'] = postprocesseddata['class'].astype('int')

In [14]:
# extract feature (postprocesseddata)
def extract_area_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    area = calculate_area(postprocessed_mask)
    return area.flatten()

def extract_perimeter_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    perimeter = calculate_perimeter(postprocessed_mask)
    return perimeter.flatten()

def extract_circularity_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path, 0)
    circularity = calculate_circularity(postprocessed_mask)
    return circularity.flatten()

def extract_color_features(postprocessedmask_path):
    postprocessed_mask = cv2.imread(postprocessedmask_path)
    color = calculate_color(postprocessed_mask)
    return color.flatten()

# apply function on the image
area_features = postprocesseddata['postprocessedmask_path'].apply(extract_area_features)
perimeter_features = postprocesseddata['postprocessedmask_path'].apply(extract_perimeter_features)
circularity_features = postprocesseddata['postprocessedmask_path'].apply(extract_circularity_features)
color_features = postprocesseddata['postprocessedmask_path'].apply(extract_color_features)

# make each number in the array separate into each column
area_features = np.vstack(area_features)
perimeter_features = np.vstack(perimeter_features)
circularity_features = np.vstack(circularity_features)
color_features = np.vstack(color_features)

# name the columns
area_columns = [f"area_{i}" for i in range(area_features.shape[1])]
perimeter_columns = [f"perimeter_{i}" for i in range(perimeter_features.shape[1])]
circularity_columns = [f"circularity_{i}" for i in range(circularity_features.shape[1])]
color_columns = [f"color_{i}" for i in range(color_features.shape[1])]

area_data=pd.DataFrame(area_features, columns=area_columns)
perimeter_data=pd.DataFrame(perimeter_features, columns=perimeter_columns)
circularity_data=pd.DataFrame(circularity_features, columns=circularity_columns)
color_data=pd.DataFrame(color_features, columns=color_columns)

# find the column which is 0 and delete the column
zero_columns = area_data.columns[(area_data == 0).all()]
area_data = area_data.loc[:, ~area_data.columns.isin(zero_columns)]

zero2_columns = perimeter_data.columns[(perimeter_data == 0).all()]
perimeter_data = perimeter_data.loc[:, ~perimeter_data.columns.isin(zero_columns)]

zero3_columns = circularity_data.columns[(circularity_data == 0).all()]
circularity_data = circularity_data.loc[:, ~circularity_data.columns.isin(zero_columns)]

zero_columns = color_data.columns[(color_data == 0).all()]
color_data = color_data.loc[:, ~color_data.columns.isin(zero_columns)]

area_data.to_csv('areagray_segnetdata.csv', index=False)
perimeter_data.to_csv('perimetergray_segnetdata.csv', index=False)
circularity_data.to_csv('circularitygray_segnetdata.csv', index=False)
color_data.to_csv('colorgray_segnetdata.csv', index=False)

In [15]:
def calculate_texture(segmented_mask):
    if segmented_mask.ndim == 3 and segmented_mask.shape[2] == 3:
        gray_image = cv2.cvtColor(segmented_mask, cv2.COLOR_RGB2GRAY)
    else:
        gray_image = segmented_mask

    if gray_image.ndim != 2:
        gray_image = gray_image[:, :, 0]

    lbp = local_binary_pattern(gray_image, 8, 1, method='uniform')
    histogram, _ = np.histogram(lbp, bins=np.arange(0, 9), range=(0, 8))
    texture = histogram / np.sum(histogram)
    return texture

In [16]:
segmenteddata['class'] = segmenteddata['class'].astype('int')

In [17]:
# extract feature (segmented data)
def extract_texture_features(segmentedmask_path):
    segmented_mask = cv2.imread(segmentedmask_path, 0)
    texture = calculate_texture(segmented_mask)
    return texture.flatten()

# apply function on the image
texture_features = segmenteddata['segmentedmask_path'].apply(extract_texture_features)

# make each number in the array separate into each column
texture_features = np.vstack(texture_features)

# name the columns
texture_columns = [f"texture_{i}" for i in range(texture_features.shape[1])]

texture_data=pd.DataFrame(texture_features, columns=texture_columns)

texture_data.to_csv('texturegray_segnetdata.csv', index=False)

## Feature Data for Classification

In [18]:
# combine all the features data into dataframe
data = pd.concat([area_data, perimeter_data, circularity_data, texture_data, color_data], axis=1)
labels = segmenteddata['class']

print()
data.head(5)
# data.to_csv('gray_segnetdata.csv', index=False)




Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000107,0.000198,0.0,0.002595,0.0,0.02094,0.0,0.976161,0.871979,0.128021
1,0.0,1.5,0.5,49.0,0.0,0.0,0.0,10.5,38.0,7.0,...,1.5e-05,0.0,0.0,0.000305,0.0,0.01474,0.0,0.98494,0.748596,0.251404
2,7399.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000198,0.0,0.014801,0.0,0.985001,0.762604,0.237396
3,0.0,0.0,84.0,3.0,2.0,0.0,3.0,0.0,0.0,0.0,...,7.6e-05,6.1e-05,0.0,0.000916,0.0,0.017397,0.0,0.98155,0.878311,0.121689
4,0.0,6385.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.5e-05,0.0,0.000137,0.0,0.014496,0.0,0.985351,0.891022,0.108978


In [19]:
# training set, testing set and validation set
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.4, random_state=42)
val_data, test_data, val_labels, test_labels = train_test_split(test_data, test_labels, test_size=0.5, random_state=42)

In [20]:
# check on training set, testing set and validation set
print("Train Data:")
train_data.head(132)

Train Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
146,2040.0,0.0,0.0,0.0,2.0,11.5,0.0,0.0,0.0,0.0,...,0.000229,0.000244,0.000000,0.006104,0.000000,0.029362,0.000000,0.964060,0.572495,0.427505
173,0.0,0.0,0.0,0.0,0.0,0.5,2.0,0.0,0.0,0.0,...,0.000764,0.001513,0.000000,0.020743,0.000031,0.062564,0.000061,0.914324,0.662613,0.337387
206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.5,...,0.000489,0.001009,0.000000,0.017870,0.000031,0.053304,0.000031,0.927267,0.565598,0.434402
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000107,0.000198,0.000000,0.002595,0.000000,0.020940,0.000000,0.976161,0.871979,0.128021
2,7399.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000198,0.000000,0.014801,0.000000,0.985001,0.762604,0.237396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000336,0.000657,0.000000,0.014036,0.000015,0.051225,0.000015,0.933716,0.700333,0.299667
14,14558.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,14554.5,0.0,...,0.000000,0.000000,0.000000,0.000092,0.000000,0.012115,0.000000,0.987793,0.546646,0.453354
92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000474,0.001132,0.000015,0.019434,0.000000,0.054444,0.000031,0.924469,0.377563,0.622437
179,0.0,0.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000382,0.000749,0.000000,0.013708,0.000015,0.040544,0.000031,0.944571,0.566971,0.433029


In [21]:
print("Test Data:")
test_data.head(44)

Test Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000505,0.001162,0.0,0.021271,4.6e-05,0.056595,6.1e-05,0.92036,0.418488,0.581512
132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.5,...,0.000275,0.00058,0.0,0.011518,0.0,0.036936,1.5e-05,0.950675,0.395157,0.604843
97,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000504,0.000641,0.0,0.016032,3.1e-05,0.050677,6.1e-05,0.932054,0.676208,0.323792
210,0.0,10.5,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.000397,0.000474,0.0,0.014068,0.0,0.045076,0.0,0.939985,0.706055,0.293945
190,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.00029,0.000611,0.0,0.008829,1.5e-05,0.029434,9.2e-05,0.960729,0.212753,0.787247
65,3570.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000475,0.001134,0.0,0.020277,3.1e-05,0.050976,6.1e-05,0.927046,0.387375,0.612625
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000352,0.000764,0.0,0.017409,0.0,0.053281,7.6e-05,0.928118,0.429932,0.570068
197,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,...,0.000794,0.000672,0.0,0.023644,4.6e-05,0.053948,3.1e-05,0.920865,0.67659,0.32341
15,3825.0,0.0,1.0,0.0,1.0,0.5,0.0,39.0,101.0,3572.5,...,0.0,1.5e-05,0.0,0.000214,0.0,0.014878,0.0,0.984894,0.877487,0.122513
112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000198,0.000412,0.0,0.00713,1.5e-05,0.03032,0.0,0.961925,0.613571,0.386429


In [22]:
print("Validation Data:")
val_data.head(44)

Validation Data:


Unnamed: 0,area_0,area_1,area_2,area_3,area_4,area_5,area_6,area_7,area_8,area_9,...,texture_0,texture_1,texture_2,texture_3,texture_4,texture_5,texture_6,texture_7,color_0,color_511
180,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.000581,0.001161,0.0,0.019575,7.6e-05,0.061842,7.6e-05,0.916688,0.648956,0.351044
18,6398.0,0.0,6385.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.1e-05,0.0,0.013916,0.0,0.986023,0.796829,0.203171
141,3315.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.5,...,9.2e-05,0.000122,0.0,0.004151,0.0,0.023593,0.0,0.972043,0.631607,0.368393
75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000275,0.000412,0.0,0.010211,3.1e-05,0.035471,4.6e-05,0.953555,0.612274,0.387726
186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00055,0.000779,0.0,0.014166,4.6e-05,0.04621,0.000107,0.938142,0.586349,0.413651
172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.000596,0.001361,0.0,0.023852,4.6e-05,0.059707,9.2e-05,0.914346,0.672211,0.327789
19,0.0,0.0,0.0,0.0,10.5,0.0,0.0,4.0,0.0,0.0,...,0.000244,0.000198,0.0,0.005509,1.5e-05,0.032184,1.5e-05,0.961834,0.888565,0.111435
82,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,6.0,1.5,...,0.000657,0.000825,0.0,0.019704,4.6e-05,0.056271,4.6e-05,0.922452,0.505432,0.494568
150,0.0,0.0,0.0,5.5,0.0,0.0,0.0,0.0,0.0,0.0,...,7.6e-05,7.6e-05,0.0,0.003815,0.0,0.022325,0.0,0.973708,0.534256,0.465744
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.5,0.0,...,0.000122,0.000122,0.0,0.003863,0.0,0.028325,0.0,0.967568,0.933762,0.066238


## Classification Modelling (SVM, Random Forest, Naive Bayes)

In [23]:
svm_classifier = SVC()
rf_classifier = RandomForestClassifier()
nb_classifier = GaussianNB()

classifiers = [svm_classifier, rf_classifier, nb_classifier]

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted')
}

# k-fold validation
# k = 10 
# results = {}
# for classifier in classifiers:
#     classifier_name = type(classifier).__name__
#     print("Running k-fold cross-validation for", classifier_name)
#     cv_results = cross_validate(classifier, train_data, train_labels, cv=k, scoring=scoring)
#     results[classifier_name] = cv_results

# for classifier_name, cv_results in results.items():
#     print("\nResults for", classifier_name)
#     print("Accuracy:", cv_results['test_accuracy'])
#     print("Precision:", cv_results['test_precision'])
#     print("Recall:", cv_results['test_recall'])
#     print("F1 Score:", cv_results['test_f1_score'])
    
k = 10
results = {}
for classifier in classifiers:
    classifier_name = type(classifier).__name__
    print("Running k-fold cross-validation for", classifier_name)
    cv_results = cross_validate(classifier, train_data, train_labels, cv=k, scoring=scoring)
    results[classifier_name] = cv_results

for classifier_name, cv_results in results.items():
    print("\nResults for", classifier_name)
    print("Accuracy:", np.mean(cv_results['test_accuracy']))
    print("Precision:", np.mean(cv_results['test_precision']))
    print("Recall:", np.mean(cv_results['test_recall']))
    print("F1 Score:", np.mean(cv_results['test_f1_score']))

Running k-fold cross-validation for SVC
Running k-fold cross-validation for RandomForestClassifier
Running k-fold cross-validation for GaussianNB

Results for SVC
Accuracy: 0.5318681318681319
Precision: 0.6137271062271064
Recall: 0.5318681318681319
F1 Score: 0.48877987092272807

Results for RandomForestClassifier
Accuracy: 0.7032967032967032
Precision: 0.7486172161172162
Recall: 0.7032967032967032
F1 Score: 0.688558227486799

Results for GaussianNB
Accuracy: 0.3412087912087912
Precision: 0.4098809523809524
Recall: 0.3412087912087912
F1 Score: 0.3379528408099837


In [24]:
# training model
svm_classifier.fit(train_data, train_labels)
rf_classifier.fit(train_data, train_labels)
nb_classifier.fit(train_data, train_labels)

# validating model
svm_val_predictions = svm_classifier.predict(val_data)
rf_val_predictions = rf_classifier.predict(val_data)
nb_val_predictions = nb_classifier.predict(val_data)

svm_val_accuracy = accuracy_score(val_labels, svm_val_predictions)
svm_val_precision = precision_score(val_labels, svm_val_predictions, average='weighted')
svm_val_recall = recall_score(val_labels, svm_val_predictions, average='weighted')
svm_val_f1_score = f1_score(val_labels, svm_val_predictions, average='weighted')

rf_val_accuracy = accuracy_score(val_labels, rf_val_predictions)
rf_val_precision = precision_score(val_labels, rf_val_predictions, average='weighted')
rf_val_recall = recall_score(val_labels, rf_val_predictions, average='weighted')
rf_val_f1_score = f1_score(val_labels, rf_val_predictions, average='weighted')

nb_val_accuracy = accuracy_score(val_labels, nb_val_predictions)
nb_val_precision = precision_score(val_labels, nb_val_predictions, average='weighted')
nb_val_recall = recall_score(val_labels, nb_val_predictions, average='weighted')
nb_val_f1_score = f1_score(val_labels, nb_val_predictions, average='weighted')

svm_val_confusion_matrix = confusion_matrix(val_labels, svm_val_predictions)
rf_val_confusion_matrix = confusion_matrix(val_labels, rf_val_predictions)
nb_val_confusion_matrix = confusion_matrix(val_labels, nb_val_predictions)

In [25]:
# testing model
svm_test_predictions = svm_classifier.predict(test_data)
rf_test_predictions = rf_classifier.predict(test_data)
nb_test_predictions = nb_classifier.predict(test_data)

svm_test_accuracy = accuracy_score(test_labels, svm_test_predictions)
svm_test_precision = precision_score(test_labels, svm_test_predictions, average='weighted')
svm_test_recall = recall_score(test_labels, svm_test_predictions, average='weighted')
svm_test_f1_score = f1_score(test_labels, svm_test_predictions, average='weighted')

rf_test_accuracy = accuracy_score(test_labels, rf_test_predictions)
rf_test_precision = precision_score(test_labels, rf_test_predictions, average='weighted')
rf_test_recall = recall_score(test_labels, rf_test_predictions, average='weighted')
rf_test_f1_score = f1_score(test_labels, rf_test_predictions, average='weighted')

nb_test_accuracy = accuracy_score(test_labels, nb_test_predictions)
nb_test_precision = precision_score(test_labels, nb_test_predictions, average='weighted')
nb_test_recall = recall_score(test_labels, nb_test_predictions, average='weighted')
nb_test_f1_score = f1_score(test_labels, nb_test_predictions, average='weighted')

svm_test_confusion_matrix = confusion_matrix(test_labels, svm_test_predictions)
rf_test_confusion_matrix = confusion_matrix(test_labels, rf_test_predictions)
nb_test_confusion_matrix = confusion_matrix(test_labels, nb_test_predictions)

In [26]:
# calculate accuracy, precision, recall, f1 score and confusion metric
print("\nValidation Set Metrics:")
print("SVM Accuracy:", svm_val_accuracy)
print("SVM Precision:", svm_val_precision)
print("SVM Recall:", svm_val_recall)
print("SVM F1 Score:", svm_val_f1_score)
print("SVM Confusion Matrix:\n", svm_val_confusion_matrix)
print("\nRandom Forest Accuracy:", rf_val_accuracy)
print("Random Forest Precision:", rf_val_precision)
print("Random Forest Recall:", rf_val_recall)
print("Random Forest F1 Score:", rf_val_f1_score)
print("Random Forest Confusion Matrix:\n", rf_val_confusion_matrix)
print("\nNaive Bayes Accuracy:", nb_val_accuracy)
print("Naive Bayes Precision:", nb_val_precision)
print("Naive Bayes Recall:", nb_val_recall)
print("Naive Bayes F1 Score:", nb_val_f1_score)
print("Naive Bayes Confusion Matrix:\n", nb_val_confusion_matrix)


print("\nTesting Set Metrics:")
print("SVM Accuracy:", svm_test_accuracy)
print("SVM Precision:", svm_test_precision)
print("SVM Recall:", svm_test_recall)
print("SVM F1 Score:", svm_test_f1_score)
print("SVM Confusion Matrix:\n", svm_test_confusion_matrix)
print("\nRandom Forest Accuracy:", rf_test_accuracy)
print("Random Forest Precision:", rf_test_precision)
print("Random Forest Recall:", rf_test_recall)
print("Random Forest F1 Score:", rf_test_f1_score)
print("Random Forest Confusion Matrix:\n", rf_test_confusion_matrix)
print("\nNaive Bayes Accuracy:", nb_test_accuracy)
print("Naive Bayes Precision:", nb_test_precision)
print("Naive Bayes Recall:", nb_test_recall)
print("Naive Bayes F1 Score:", nb_test_f1_score)
print("Naive Bayes Confusion Matrix:\n", nb_test_confusion_matrix)


Validation Set Metrics:
SVM Accuracy: 0.36363636363636365
SVM Precision: 0.5052447552447552
SVM Recall: 0.36363636363636365
SVM F1 Score: 0.30837401746492654
SVM Confusion Matrix:
 [[10  0  0  1]
 [ 3  2  1  7]
 [ 8  0  1  1]
 [ 5  0  2  3]]

Random Forest Accuracy: 0.6818181818181818
Random Forest Precision: 0.7305147058823529
Random Forest Recall: 0.6818181818181818
Random Forest F1 Score: 0.6881344776081618
Random Forest Confusion Matrix:
 [[9 0 0 2]
 [0 5 0 8]
 [1 0 9 0]
 [0 3 0 7]]

Naive Bayes Accuracy: 0.3181818181818182
Naive Bayes Precision: 0.3619560264297106
Naive Bayes Recall: 0.3181818181818182
Naive Bayes F1 Score: 0.31483765319960083
Naive Bayes Confusion Matrix:
 [[6 1 0 4]
 [4 3 3 3]
 [4 0 3 3]
 [5 2 1 2]]

Testing Set Metrics:
SVM Accuracy: 0.38636363636363635
SVM Precision: 0.49913419913419915
SVM Recall: 0.38636363636363635
SVM F1 Score: 0.3519385026737968
SVM Confusion Matrix:
 [[8 0 0 1]
 [4 3 0 5]
 [6 0 2 5]
 [3 2 1 4]]

Random Forest Accuracy: 0.636363636363636