# EDA



### Load and clean data

- Add class-wise accuracy! (equal to IoU)

In [None]:
# Import packages
import pandas as pd
import glob
from matplotlib import pyplot as plt
import os
import cv2
import random
import numpy as np
import sys
import seaborn as sns
import ptitprince as pt
from patchify import patchify

sys.path.append('../modeling')
from baseline_model import baseline_model

sys.path.append('../modeling')
from predict import single_image_IoU, map_func, patch_masks

In [None]:
# Collect the paths of the per-image area CSVs (alphabetically ordered),
# once for the corrected and once for the uncorrected labelling.
path_corrected = '../data/areas/corrected'
filenames_corrected = sorted(glob.glob(path_corrected + "/*.csv"))

path_uncorrected = '../data/areas/uncorrected'
filenames_uncorrected = sorted(glob.glob(path_uncorrected + "/*.csv"))

In [None]:
# Read every CSV into its own dataframe; list order follows the filename lists
dataframes_corr_list = [pd.read_csv(csv_path) for csv_path in filenames_corrected]

dataframes_uncorr_list = [pd.read_csv(csv_path) for csv_path in filenames_uncorrected]

In [None]:
# Replace the numeric class codes 1, 2, 3 with Fracture, Pore, Tiny Pore.
# The replaced column is assigned back explicitly: an in-place replace on
# `df['label_id']` is a chained modification that pandas may apply to a
# temporary copy, which would leave the dataframe itself unchanged.
# Because the dataframes are modified through their own reference, nothing
# has to be written back into the lists.
label_names = {1: 'Fracture', 2: 'Pore', 3: 'Tiny Pore'}

for label_df in dataframes_uncorr_list + dataframes_corr_list:
    label_df['label_id'] = label_df['label_id'].replace(label_names)

### Data Image EDA

In [None]:
# Directories holding the training images and the training masks
train_img_dir = '../data/data_train/train/images/train/'
train_mask_dir = '../data/data_train/train/masks/train/'

# Sorted listings (presumably so image and mask names pair up by index)
img_list = sorted(os.listdir(train_img_dir))
msk_list = sorted(os.listdir(train_mask_dir))
num_images = len(img_list)
print("Total number of training images are: ", num_images)

In [None]:
# Sanity check: show one randomly chosen training image next to the mask
# found at the same list position
img_num = random.randint(0, num_images-1)

# imread flag 0 loads the files as single-channel grayscale
img_for_plot = cv2.imread(train_img_dir + img_list[img_num], 0)
mask_for_plot = cv2.imread(train_mask_dir + msk_list[img_num], 0)

plt.figure(figsize=(12, 8))
for position, picture, heading in (
    (121, img_for_plot, 'Image'),
    (122, mask_for_plot, 'Mask'),
):
    plt.subplot(position)
    plt.imshow(picture, cmap='gray')
    plt.title(heading)
plt.show()

In [None]:
# List the distinct pixel values present in the mask loaded for the plot
mask_values = np.unique(mask_for_plot)
print("Unique values in the mask are: ", mask_values)

### Object classification and baseline error analysis

In [None]:
# Count the elements (dataframe rows) of the uncorrected labelling,
# per image and in total
elements_per_image = [len(image_df) for image_df in dataframes_uncorr_list]
all_elements_u = sum(elements_per_image)

print(f'The total number of uncorrected elements are: {all_elements_u}')
print(f'The number of uncorrected elements per image are: {*elements_per_image,}')

In [None]:
# Count the elements (dataframe rows) of the corrected labelling, per image
# and in total, then report how many elements the correction removed
elements_per_image = [len(image_df) for image_df in dataframes_corr_list]
all_elements_c = sum(elements_per_image)

print(f'The total number of corrected elements are: {all_elements_c}')
print(f'The number of corrected elements per image are: {*elements_per_image,}')
print(f'The total number of deleted elements is equal to {all_elements_u - all_elements_c}')
print(f'.... this is equal to {round(((all_elements_u - all_elements_c)/all_elements_u)*100, 2)}% of all elements')

In [None]:
# Number of elements per class, summed over all corrected images.
# Each class is looked up by its label instead of by row position, so an
# image that lacks one class contributes 0 for it rather than shifting the
# remaining classes into the wrong counters.
# The numeric codes are mapped to names locally (a no-op on labels that are
# already names), so this cell works whether or not the renaming cell ran.
label_names = {1: 'Fracture', 2: 'Pore', 3: 'Tiny Pore'}

total_frac_size = 0
total_pore_size = 0
total_tiny_pore_size = 0

for image_df in dataframes_corr_list:
    class_names = image_df['label_id'].replace(label_names)
    # count() counts the non-missing grain ids within each class
    objects_per_class = image_df['grain_id'].groupby(class_names).count()
    total_frac_size += objects_per_class.get('Fracture', 0)
    total_pore_size += objects_per_class.get('Pore', 0)
    total_tiny_pore_size += objects_per_class.get('Tiny Pore', 0)

print(f'The total number of fractures are equal to {total_frac_size}')
print(f'The total number of pores are equal to {total_pore_size}')
print(f'The total number of tiny pores are equal to {total_tiny_pore_size}')

In [None]:
# Pie chart: share of objects per class
data = [total_frac_size, total_pore_size, total_tiny_pore_size]
labels = ['Fractures', 'Pores', 'Tiny Pores']

# First five colours of seaborn's pastel palette
colors = sns.color_palette('pastel')[0:5]

plt.figure(figsize=(10,10))
plt.pie(
    data,
    labels=labels,
    colors=colors,
    autopct='%.0f%%',
    explode=[0.05]*3,
    pctdistance=0.5,
)
plt.title('Relative amount of objects')
plt.show()

In [None]:
# Total area per class, summed over all corrected images.
# Only the `area` column is aggregated, and each class is looked up by its
# label instead of by row position, so an image that lacks one class
# contributes 0 for it rather than shifting the remaining classes.
# The numeric codes are mapped to names locally (a no-op on labels that are
# already names), so this cell works whether or not the renaming cell ran.
label_names = {1: 'Fracture', 2: 'Pore', 3: 'Tiny Pore'}

total_frac_area = 0
total_pore_area = 0
total_tiny_pore_area = 0

for image_df in dataframes_corr_list:
    class_names = image_df['label_id'].replace(label_names)
    area_per_class = image_df['area'].groupby(class_names).sum()
    total_frac_area += area_per_class.get('Fracture', 0)
    total_pore_area += area_per_class.get('Pore', 0)
    total_tiny_pore_area += area_per_class.get('Tiny Pore', 0)

# Report every class as a rounded percentage of the summed area
total_area = total_frac_area + total_pore_area + total_tiny_pore_area
print(f'The total area of fractures is equal to {round(total_frac_area/total_area*100)}%')
print(f'The total area of pores is equal to {round(total_pore_area/total_area*100)}%')
print(f'The total area of tiny pores is equal to {round(total_tiny_pore_area/total_area*100)}%')



In [None]:
# Pie chart: share of the total area per class
data = [total_frac_area, total_pore_area, total_tiny_pore_area]
labels = ['Fractures', 'Pores', 'Tiny Pores']

# First five colours of seaborn's pastel palette
colors = sns.color_palette('pastel')[0:5]

plt.figure(figsize=(10,10))
plt.pie(
    data,
    labels=labels,
    colors=colors,
    autopct='%.0f%%',
    explode=[0.05]*3,
    pctdistance=0.5,
)
plt.title('Relative area of classes')
plt.show()

### Baseline model
The current approach employs a CNN for segmentation and a simple decision tree for labelling.
The labelling is later corrected by hand. The difference between the original labelling and the corrections gives us an estimate of the current error (apart from any error introduced by the human corrections themselves).

In [None]:
# Relative difference (in %) between corrected and uncorrected class area,
# computed per image for the first two classes
# NOTE(review): rows 0 and 1 of the grouped frame are taken to be fractures
# and pores; that only holds when both classes occur in the image - confirm.
frac_error = []
pore_error = []

for idx, corrected_df in enumerate(dataframes_corr_list):
    uncorrected_df = dataframes_uncorr_list[idx]

    # Per-class sums for the corrected and the uncorrected labelling
    area_corr = corrected_df.groupby('label_id').sum().reset_index()
    area_uncorr = uncorrected_df.groupby('label_id').sum().reset_index()

    for row, errors in ((0, frac_error), (1, pore_error)):
        corr_area = area_corr.at[row, 'area']
        uncorr_area = area_uncorr.at[row, 'area']
        # Deviation relative to the uncorrected area, as a whole percentage
        relative_diff = abs(corr_area - uncorr_area) / uncorr_area
        errors.append(round(relative_diff * 100))

print(f'The error of classification (%) for fractures within each image is equal to: {*frac_error,}')
print(f'The error of classification (%) for pores within each image is equal to: {*pore_error,}')

mean_error_frac = sum(frac_error)/len(frac_error)
mean_error_pore = sum(pore_error)/len(pore_error)

print(f'The mean area error for fractures is equal to {round(mean_error_frac)}%')
print(f'The mean area error for pores is equal to {round(mean_error_pore)}%')

In [None]:
# Horizontal boxplots of the per-image area error, one box per class
data = [frac_error, pore_error]
labels = ['Fractures', 'Pores']

fig, ax = plt.subplots()
ax.set_title('Mean error of area for both classes')
ax.set_xlabel('Error in area (%)')
# Draw the boxes first: boxplot() fixes one y tick per box, and tick labels
# should only be assigned once the tick positions are fixed.
ax.boxplot(data, vert=False)
ax.set_yticklabels(labels)

plt.show()

In [None]:
# Long-format dataframe of the errors: one row per value, tagged with its class
data = [frac_error, pore_error]

# One label per error value; list repetition keeps both columns the same
# length even when an error list is empty
frac_label = ['Fractures'] * len(frac_error)
pore_label = ['Pores'] * len(pore_error)

df = pd.DataFrame(
    {'error': frac_error+pore_error,
     'type': frac_label+pore_label,
    })


In [None]:
# Raincloud-style plot of the errors in `df`: half violin, raw points and a
# narrow box drawn on the same axes
f, ax = plt.subplots(figsize=(12, 7))

ax=pt.half_violinplot( x = 'type', y = 'error', data = df, palette=['salmon','darkcyan'],
                scale = "area", width = .6, inner = None, orient = "v")
ax=sns.stripplot(x = 'type', y = 'error', data = df, palette=['salmon','darkcyan'],
                size = 8, jitter = 1, orient = "v")
ax=sns.boxplot(x = 'type', y = 'error', data = df, color = "black", width = .15, zorder = 10,
                showcaps = True, boxprops = {'facecolor':'none', "zorder":10},
                showfliers=False, saturation = 1, orient = "v")

ax.set_title('Mean error of area for both classes')

#set y axis label
plt.ylabel('Error [%]')
#set x axis label
plt.xlabel('')
plt.tight_layout()
# show() takes no figure argument (its only parameter is the keyword
# `block`); it displays every open figure
plt.show()

In [None]:
# Relative difference (in %) between the corrected and uncorrected number of
# objects, computed per image for the first two classes
# NOTE(review): rows 0 and 1 of the grouped frame are taken to be fractures
# and pores; that only holds when both classes occur in the image - confirm.
frac_error = []
pore_error = []

for idx, corrected_df in enumerate(dataframes_corr_list):
    uncorrected_df = dataframes_uncorr_list[idx]

    # Per-class object counts for the corrected and the uncorrected labelling
    counts_corr = corrected_df.groupby('label_id').count().reset_index()
    counts_uncorr = uncorrected_df.groupby('label_id').count().reset_index()

    for row, errors in ((0, frac_error), (1, pore_error)):
        corr_count = counts_corr.at[row, 'grain_id']
        uncorr_count = counts_uncorr.at[row, 'grain_id']
        # Deviation relative to the uncorrected count, as a whole percentage
        relative_diff = abs(corr_count - uncorr_count) / uncorr_count
        errors.append(round(relative_diff * 100))



In [None]:
# Report the per-image object-count errors and their mean per class.
# The values printed here are differences in the number of objects, not in
# area, so the messages say "count error".
print(f'The error of classification (%) for fractures within each image is equal to: {*frac_error,}')
print(f'The error of classification (%) for pores within each image is equal to: {*pore_error,}')

mean_error_frac = sum(frac_error)/len(frac_error)
mean_error_pore = sum(pore_error)/len(pore_error)

print(f'The mean count error for fractures is equal to {round(mean_error_frac)}%')
print(f'The mean count error for pores is equal to {round(mean_error_pore)}%')

In [None]:
# Horizontal boxplots of the per-image object-count error, one box per class
data = [frac_error, pore_error]
labels = ['Fractures', 'Pores']

fig, ax = plt.subplots()
# The plotted values are count errors, so title and axis label say so
ax.set_title('Mean error of object count for both classes')
ax.set_xlabel('Error in object count (%)')
# Draw the boxes first: boxplot() fixes one y tick per box, and tick labels
# should only be assigned once the tick positions are fixed.
ax.boxplot(data, vert=False)
ax.set_yticklabels(labels)

plt.show()

### Calculation on images

In [None]:
# Inputs for the pixelwise accuracy between each corrected and uncorrected
# mask: the two mask directories and their sorted file listings
corr_masks = "../data/baseline/corrected_masks/"
uncorrected_masks = "../data/baseline/original_masks/"

img_list = sorted(os.listdir(corr_masks))
msk_list = sorted(os.listdir(uncorrected_masks))
num_images = len(img_list)
print("Total number of training images are: ", num_images)

In [None]:
# Run the project's baseline comparison on the corrected vs. original masks
# (presumably reports the pixelwise accuracy per image - TODO confirm against
# `baseline_model` in ../modeling).
baseline_model(corr_masks, uncorrected_masks, img_list, msk_list)
# The overall pixelwise accuracy is pretty good (99%). However, this is misleading because most of that
# accuracy comes from the background (which is not corrected).

In [None]:
# Hard-coded pixelwise accuracies in percent (presumably copied from the
# baseline comparison output - TODO confirm)
accuracy = [99.3, 99.37, 99.02, 98.95, 99.2, 99.07, 99.1, 99.3]
data = accuracy

# Single boxplot summarising the accuracy values
fig, ax = plt.subplots()
ax.boxplot(data)
ax.set_title('Pixelwise Accuracy')

plt.show()

In [None]:
# Long-format dataframe of the accuracies; every row carries the same
# 'accuracy' tag so the values can be drawn as one category.
# List repetition keeps both columns the same length even for empty data.
label = ['accuracy'] * len(data)

df = pd.DataFrame(
    {'accuracy': data,
     'type': label,
    })

In [None]:
# Raincloud-style plot of the accuracies in `df`: half violin, raw points and
# a narrow box drawn on the same axes
f, ax = plt.subplots(figsize=(12, 7))

ax=pt.half_violinplot( x = 'type', y = 'accuracy', data = df, palette=['salmon','darkcyan'],
                scale = "area", width = .6, inner = None, orient = "v")
ax=sns.stripplot(x = 'type', y = 'accuracy', data = df, palette=['salmon','darkcyan'],
                size = 8, jitter = 1, orient = "v")
ax=sns.boxplot(x = 'type', y = 'accuracy', data = df, color = "black", width = .15, zorder = 10,
                showcaps = True, boxprops = {'facecolor':'none', "zorder":10},
                showfliers=False, saturation = 1, orient = "v")

ax.set_title('Pixelwise accuracy for all images')

#set y axis label
plt.ylabel('Accuracy [%]')
#set x axis label
plt.xlabel('')
plt.tight_layout()
# show() takes no figure argument (its only parameter is the keyword
# `block`); it displays every open figure
plt.show()

A total of 1020 images of size 512×512 exist for training purposes.

The masks contain four distinct grayscale values, one for each of the four classes.

#### Compute IoUs for the baseline model (comparison between auto-created masks and corrected masks)

In [None]:
# Directories of the corrected ("true") masks and the original baseline masks
true_msk_dir = '../data/baseline/corrected_masks/'
baseline_msk_dir = '../data/baseline/original_masks/'

true_msk_list = sorted(os.listdir(true_msk_dir))
baseline_msk_list = sorted(os.listdir(baseline_msk_dir))

# Read every corrected mask as a grayscale image (imread flag 0)
true_msk = [cv2.imread(true_msk_dir + mask_name, 0) for mask_name in true_msk_list]

# Read every baseline mask the same way
baseline_msk = [cv2.imread(baseline_msk_dir + mask_name, 0) for mask_name in baseline_msk_list]

In [None]:
# Re-encode the masks element-wise with `map_func` and the gray-value table
# below (the notebook refers to this step as one-hot encoding)
oh_dict = {28:1, 124:2, 222:3}

# Element-wise wrapper around map_func, built once and used for both mask sets
vfunc = np.vectorize(map_func)

true_msk_oh = [vfunc(np.array(mask), oh_dict) for mask in true_msk]

baseline_msk_oh = [vfunc(np.array(mask), oh_dict) for mask in baseline_msk]

In [None]:
# Per-image IoU results: the mean IoU plus one list for each of the four classes
iw_baseline_IoU = []
c1_baseline_IoU = []
c2_baseline_IoU = []
c3_baseline_IoU = []
c4_baseline_IoU = []

for i, single_true_msk in enumerate(true_msk_oh):
    print(f'--------Calc. image nr.{i+1}--------')
    single_baseline_msk = baseline_msk_oh[i]

    # Cut both masks into non-overlapping 512x512 patches
    patches_true = patchify(single_true_msk, (512, 512), step=512)
    patches_base = patchify(single_baseline_msk, (512, 512), step=512)

    # Hand the patch grids of the true and the baseline mask to patch_masks
    print('creating patches...')
    output_true_patches, output_baseline_patches = patch_masks(patches_true, patches_base)
    print('finished creating patches')

    # IoU sequences for this image, each averaged and rounded to two decimals
    print('calculating IoUs...')
    mean_IoU, class_1_IoU, class_2_IoU, class_3_IoU, class_4_IoU = single_image_IoU(output_true_patches, output_baseline_patches)

    score_targets = (
        (mean_IoU, iw_baseline_IoU),
        (class_1_IoU, c1_baseline_IoU),
        (class_2_IoU, c2_baseline_IoU),
        (class_3_IoU, c3_baseline_IoU),
        (class_4_IoU, c4_baseline_IoU),
    )
    for scores, target in score_targets:
        target.append(round(sum(scores)/len(scores), 2))
    print('finished calculating IoUs')

In [None]:
# Mean IoUs of the baseline model, averaged over all images
def _rounded_mean(scores):
    """Return the arithmetic mean of *scores*, rounded to two decimals."""
    return round(sum(scores)/len(scores), 2)


iw_mean_IoU_baseline = _rounded_mean(iw_baseline_IoU)
c1_mean_IoU_baseline = _rounded_mean(c1_baseline_IoU)
c2_mean_IoU_baseline = _rounded_mean(c2_baseline_IoU)
c3_mean_IoU_baseline = _rounded_mean(c3_baseline_IoU)
c4_mean_IoU_baseline = _rounded_mean(c4_baseline_IoU)

In [None]:
# Baseline summary: image-wise mean IoU followed by the mean IoUs of classes 1-4
IoU_baseline = [iw_mean_IoU_baseline, c1_mean_IoU_baseline, c2_mean_IoU_baseline, c3_mean_IoU_baseline, c4_mean_IoU_baseline]

In [None]:
# Bare expression: shows the summary list as the cell output
IoU_baseline