# Labeling

Generate a label array with one entry per cluster (length 16, 32, 64, or 128).

In [1]:
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import csv
import glob
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

## An all-in-one function combining all four methods below

In [52]:
# Labeling modes accepted by auto_label().
PRECISION_50 = 1    # precision above 0.5 wins, otherwise the best F1 score
PRECISION_X = 2     # precision above `threshold` wins, otherwise the best F1 score
FSCORE_SOLO = 3     # best F1 score only
PRECISION_SOLO = 4  # best precision only

def auto_label(seg_model, seg_nd, cluster_num, mode, threshold=0.5):
    """Assign a class number to every cluster of one segmentation run.

    Reads ``evaluation_rec_f1/<seg_model>_<seg_nd>_<cluster_num>_f1.csv``
    (relative to the current working directory), averages the per-class
    ``*_micro_precision`` and ``*_micro_f1`` columns over all rows of each
    cluster, and picks one class per cluster according to ``mode``.

    Parameters
    ----------
    seg_model, seg_nd : str
        Only used to build the CSV file name.
    cluster_num : int
        Total number of clusters; ids ``0 .. cluster_num - 1`` are labelled.
    mode : int
        One of ``PRECISION_50`` (1), ``PRECISION_X`` (2), ``FSCORE_SOLO`` (3)
        or ``PRECISION_SOLO`` (4).
    threshold : float, optional
        Precision cut-off for modes 1 and 2. A class is chosen by precision
        only when its mean precision is strictly greater than ``threshold``;
        otherwise the class with the highest mean F1 is chosen.
        Mode 1 requires the default value 0.5.

    Returns
    -------
    list of int
        ``label[i]`` is the class number of cluster ``i``:
        1 = pore, 2 = gypsum, 3 = celestite, 4 = bassanite.
        A cluster without any row in the CSV keeps the value 0.

    Raises
    ------
    AssertionError
        If ``mode`` is not 1-4, or if mode 1 is used with ``threshold != 0.5``.
    """
    assert mode in [1,2,3,4], "Invalid mode: mode should be integer in [1,2,3,4]."
    if mode == 1:
        assert threshold == 0.5, "Mode 1 requires threshold = 0.5."

    # The position in this list defines the class number (position + 1).
    class_names = ['pore', 'gypsum', 'celestite', 'bassanite']
    precision_cols = ['{}_micro_precision'.format(name) for name in class_names]
    f1_cols = ['{}_micro_f1'.format(name) for name in class_names]

    csv_file = os.path.join(os.getcwd(), 'evaluation_rec_f1', '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num))
    df = pd.read_csv(csv_file, usecols=['current_cluster'] + precision_cols + f1_cols)

    # One pass over the file instead of one filter per cluster. Columns are
    # addressed by name below because read_csv(usecols=...) keeps the column
    # order of the file, so positional access would depend on the CSV layout.
    cluster_means = df.groupby('current_cluster').mean()

    label = [0]*cluster_num

    for i in range(cluster_num):
        if i not in cluster_means.index:
            # No rows for this cluster: all means would be NaN and argmax
            # would silently pick class 1, so leave the cluster unlabeled (0).
            continue

        row = cluster_means.loc[i]
        precisions = [row[col] for col in precision_cols]
        fscores = [row[col] for col in f1_cols]
        p_idx = int(np.argmax(precisions))
        f_idx = int(np.argmax(fscores))

        if mode in (PRECISION_50, PRECISION_X):
            # Trust precision only when it clears the threshold.
            idx = f_idx if precisions[p_idx] <= threshold else p_idx
        elif mode == FSCORE_SOLO:
            idx = f_idx
        else:
            idx = p_idx

        # Plain int so the returned list prints and serialises cleanly.
        label[i] = idx + 1

    return label

    

In [57]:
# Segmentation run to label.
seg_model = 'k-means'   # either 'gmm' or 'k-means'
seg_nd = '4d'           # either '3d' or '4d'
cluster_num = 16        # one of 16, 32, 64, 128

# Precision-first labeling with a 0.7 cut-off.
auto_label(seg_model, seg_nd, cluster_num, mode=PRECISION_X, threshold=0.7)

[2, 3, 4, 2, 2, 1, 4, 2, 1, 2, 1, 2, 4, 2, 3, 2]

### Method 1:

1. Label the cluster as the class whose precision is over 50% [this threshold can be adjusted (see Method 2)]

2. If there is no such class, choose the one with the highest micro f1-score (the `*_micro_f1` columns)

In [2]:
# Segmentation run to label.
res_folder = 'large_clusters_rec'
seg_model = 'k-means'   # either 'gmm' or 'k-means'
seg_nd = '4d'           # either '3d' or '4d'
cluster_num = 16        # one of 16, 32, 64, 128

# Data path of this run (not read in this cell).
base_folder = os.path.join(
    os.getcwd(), res_folder, seg_model, seg_nd,
    'cluster_{}'.format(cluster_num),
)

# Evaluation CSV that belongs to this run.
csv_file = os.path.join(
    os.getcwd(), 'evaluation_rec_f1',
    '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num),
)

# Load the slice / cluster ids plus the per-class precision and F1 columns.
df = pd.read_csv(
    csv_file,
    usecols=[
        'slice', 'current_cluster',
        'pore_micro_precision', 'pore_micro_f1',
        'gypsum_micro_precision', 'gypsum_micro_f1',
        'celestite_micro_precision', 'celestite_micro_f1',
        'bassanite_micro_precision', 'bassanite_micro_f1',
    ],
)

# One label slot per cluster; 0 means "not assigned yet".
label = [0] * cluster_num

In [3]:
# Label numbering follows this class order:
# 1 = pore, 2 = gypsum, 3 = celestite, 4 = bassanite.
class_names = ['pore', 'gypsum', 'celestite', 'bassanite']

for i in range(cluster_num):
    one_cluster = df.loc[df['current_cluster'] == i]
    if one_cluster.empty:
        # No rows for this cluster: every mean would be NaN and the lookup
        # below would silently pick class 1, so leave it unlabeled instead.
        label[i] = 0
        continue

    # Mean of every column over all rows of this cluster.
    stats = one_cluster.mean()
    # Look the metrics up by column name: read_csv(usecols=...) keeps the
    # column order of the file, so positional access is not reliable.
    precisions = [stats['{}_micro_precision'.format(name)] for name in class_names]
    fscores = [stats['{}_micro_f1'.format(name)] for name in class_names]

    p_max = max(precisions)
    if p_max <= 0.5:
        # No class is precise enough: fall back to the best F1 score.
        idx = fscores.index(max(fscores))
    else:
        idx = precisions.index(p_max)

    class_num = idx + 1
    label[i] = class_num


In [4]:
# Show the result: entry i is the class number assigned to cluster i.
label

[2, 2, 4, 2, 2, 2, 4, 2, 1, 2, 1, 2, 2, 2, 3, 2]

### Method 2:

1. Label the cluster as the class whose precision is over x%

2. If there is no such class, choose the one with the highest micro f1-score (the `*_micro_f1` columns)

In [41]:
# Segmentation run to label.
res_folder = 'large_clusters_rec'
seg_model = 'k-means'   # either 'gmm' or 'k-means'
seg_nd = '4d'           # either '3d' or '4d'
cluster_num = 16        # one of 16, 32, 64, 128

# Data path of this run (not read in this cell).
base_folder = os.path.join(
    os.getcwd(), res_folder, seg_model, seg_nd,
    'cluster_{}'.format(cluster_num),
)

# Evaluation CSV that belongs to this run.
csv_file = os.path.join(
    os.getcwd(), 'evaluation_rec_f1',
    '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num),
)

# Load the slice / cluster ids plus the per-class precision and F1 columns.
df = pd.read_csv(
    csv_file,
    usecols=[
        'slice', 'current_cluster',
        'pore_micro_precision', 'pore_micro_f1',
        'gypsum_micro_precision', 'gypsum_micro_f1',
        'celestite_micro_precision', 'celestite_micro_f1',
        'bassanite_micro_precision', 'bassanite_micro_f1',
    ],
)

# One label slot per cluster; 0 means "not assigned yet".
label = [0] * cluster_num

In [42]:
# Precision cut-off: a class wins on precision only when strictly above it.
THRESHOLD = 0.7

# Label numbering follows this class order:
# 1 = pore, 2 = gypsum, 3 = celestite, 4 = bassanite.
class_names = ['pore', 'gypsum', 'celestite', 'bassanite']

for i in range(cluster_num):
    one_cluster = df.loc[df['current_cluster'] == i]
    if one_cluster.empty:
        # No rows for this cluster: every mean would be NaN and the lookup
        # below would silently pick class 1, so leave it unlabeled instead.
        label[i] = 0
        continue

    # Mean of every column over all rows of this cluster.
    stats = one_cluster.mean()
    # Look the metrics up by column name: read_csv(usecols=...) keeps the
    # column order of the file, so positional access is not reliable.
    precisions = [stats['{}_micro_precision'.format(name)] for name in class_names]
    fscores = [stats['{}_micro_f1'.format(name)] for name in class_names]

    p_max = max(precisions)
    if p_max <= THRESHOLD:
        # No class is precise enough: fall back to the best F1 score.
        idx = fscores.index(max(fscores))
    else:
        idx = precisions.index(p_max)

    class_num = idx + 1
    label[i] = class_num

In [43]:
# Show the result: entry i is the class number assigned to cluster i.
label

[2, 3, 4, 2, 2, 1, 4, 2, 1, 2, 1, 2, 4, 2, 3, 2]

### Method 3:

Solely based on f1-score

In [38]:
# Segmentation run to label.
res_folder = 'large_clusters_rec'
seg_model = 'k-means'   # either 'gmm' or 'k-means'
seg_nd = '4d'           # either '3d' or '4d'
cluster_num = 16        # one of 16, 32, 64, 128

# Data path of this run (not read in this cell).
base_folder = os.path.join(
    os.getcwd(), res_folder, seg_model, seg_nd,
    'cluster_{}'.format(cluster_num),
)

# Evaluation CSV that belongs to this run.
csv_file = os.path.join(
    os.getcwd(), 'evaluation_rec_f1',
    '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num),
)

# Load the slice / cluster ids plus the per-class precision and F1 columns.
df = pd.read_csv(
    csv_file,
    usecols=[
        'slice', 'current_cluster',
        'pore_micro_precision', 'pore_micro_f1',
        'gypsum_micro_precision', 'gypsum_micro_f1',
        'celestite_micro_precision', 'celestite_micro_f1',
        'bassanite_micro_precision', 'bassanite_micro_f1',
    ],
)

# One label slot per cluster; 0 means "not assigned yet".
label = [0] * cluster_num

In [39]:
# Label numbering follows this class order:
# 1 = pore, 2 = gypsum, 3 = celestite, 4 = bassanite.
class_names = ['pore', 'gypsum', 'celestite', 'bassanite']

for i in range(cluster_num):
    one_cluster = df.loc[df['current_cluster'] == i]
    if one_cluster.empty:
        # No rows for this cluster: every mean would be NaN and argmax
        # would silently pick class 1, so leave it unlabeled instead.
        label[i] = 0
        continue

    # Mean of every column over all rows of this cluster.
    stats = one_cluster.mean()
    # Look the metrics up by column name: read_csv(usecols=...) keeps the
    # column order of the file, so positional access is not reliable.
    fscores = [stats['{}_micro_f1'.format(name)] for name in class_names]
    # Despite the name, f_max is the position of the best F1 score.
    f_max = np.argmax(fscores)

    # int(): store a plain Python int rather than a numpy integer.
    class_num = int(f_max) + 1
    label[i] = class_num

In [40]:
# Show the result: entry i is the class number assigned to cluster i.
label

[2, 3, 4, 2, 2, 1, 4, 4, 1, 2, 1, 2, 4, 2, 3, 2]

### Method 4:

Solely based on precision

In [44]:
# Segmentation run to label.
res_folder = 'large_clusters_rec'
seg_model = 'k-means'   # either 'gmm' or 'k-means'
seg_nd = '4d'           # either '3d' or '4d'
cluster_num = 16        # one of 16, 32, 64, 128

# Data path of this run (not read in this cell).
base_folder = os.path.join(
    os.getcwd(), res_folder, seg_model, seg_nd,
    'cluster_{}'.format(cluster_num),
)

# Evaluation CSV that belongs to this run.
csv_file = os.path.join(
    os.getcwd(), 'evaluation_rec_f1',
    '{}_{}_{}_f1.csv'.format(seg_model, seg_nd, cluster_num),
)

# Load the slice / cluster ids plus the per-class precision and F1 columns.
df = pd.read_csv(
    csv_file,
    usecols=[
        'slice', 'current_cluster',
        'pore_micro_precision', 'pore_micro_f1',
        'gypsum_micro_precision', 'gypsum_micro_f1',
        'celestite_micro_precision', 'celestite_micro_f1',
        'bassanite_micro_precision', 'bassanite_micro_f1',
    ],
)

# One label slot per cluster; 0 means "not assigned yet".
label = [0] * cluster_num

In [45]:
# Label numbering follows this class order:
# 1 = pore, 2 = gypsum, 3 = celestite, 4 = bassanite.
class_names = ['pore', 'gypsum', 'celestite', 'bassanite']

for i in range(cluster_num):
    one_cluster = df.loc[df['current_cluster'] == i]
    if one_cluster.empty:
        # No rows for this cluster: every mean would be NaN and argmax
        # would silently pick class 1, so leave it unlabeled instead.
        label[i] = 0
        continue

    # Mean of every column over all rows of this cluster.
    stats = one_cluster.mean()
    # Look the metrics up by column name: read_csv(usecols=...) keeps the
    # column order of the file, so positional access is not reliable.
    precisions = [stats['{}_micro_precision'.format(name)] for name in class_names]
    # Despite the name, p_max is the position of the best precision.
    p_max = np.argmax(precisions)
    # int(): store a plain Python int rather than a numpy integer.
    class_num = int(p_max) + 1
    label[i] = class_num

In [46]:
# Show the result: entry i is the class number assigned to cluster i.
label

[2, 2, 4, 2, 2, 2, 4, 2, 1, 2, 1, 2, 2, 2, 3, 2]

# Inspection

In [36]:
# Pull out the rows of a single cluster (id 15) for a closer look.
one_cluster = df[df['current_cluster'] == 15]

In [37]:
# Column-wise mean over the rows of the cluster selected in the previous cell;
# the bare `stats` on the last line displays the resulting Series.
stats = one_cluster.mean()
stats

slice                        6.000000e+02
current_cluster              1.500000e+01
pore_micro_precision         1.569666e-02
pore_micro_f1                1.964436e-02
gypsum_micro_precision       8.040682e-01
gypsum_micro_f1              2.010560e-01
celestite_micro_precision    4.007143e-08
celestite_micro_f1           7.526645e-08
bassanite_micro_precision    1.558289e-01
bassanite_micro_f1           1.176972e-01
dtype: float64