In [None]:
from sklearn.cluster import KMeans
import glob
from sklearn.metrics.pairwise import euclidean_distances
import csv
import os
from PIL import Image
from itertools import product
import numpy as np

In [None]:
# Side length handed to tile() as the patch size; 256 // 64 evaluates to 4.
PATCH_IMAGE_SIZE = 256 // 64

# Folder whose immediate sub-directories are scanned for images.
SOURCE_DIR = '/kaggle/input/tomato-short-dataset-5000/test'

# CSV file that receives one weight row per image.
OUTPUT_FILE = '/kaggle/working/test_weights.csv'

In [None]:
# Column names of the weights CSV. Opening with 'w' truncates any existing
# file, so re-running this cell starts the output from scratch.
header = ['filename', 'class', 'weight', 'full_path']
with open(OUTPUT_FILE, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerow(header)

In [None]:
def tile(filename, dir_in, d):
    """Cut an image into non-overlapping ``d`` x ``d`` patches.

    Patches are taken row by row (top to bottom, left to right). Any
    remainder at the right or bottom edge that does not fill a whole patch
    is dropped.

    Args:
        filename: Name of the image file, relative to ``dir_in``.
        dir_in: Directory that contains the image.
        d: Patch side length in pixels; must be positive.

    Returns:
        A list of cropped images, possibly empty when the image is smaller
        than ``d`` in either dimension, or ``None`` when the file could not
        be opened or decoded.

    Raises:
        ValueError: If ``d`` is not positive.
    """
    if d <= 0:
        raise ValueError('patch size d must be a positive integer')

    path = os.path.join(dir_in, filename)
    try:
        # Image.open() only identifies the file; pixel data is decoded on
        # first use (here by crop()). Cropping therefore stays inside the
        # try so truncated/corrupt files are skipped like unreadable ones,
        # and the context manager releases the file handle afterwards.
        with Image.open(path) as img:
            w, h = img.size
            rows = range(0, h - h % d, d)
            cols = range(0, w - w % d, d)
            return [img.crop((j, i, j + d, i + d)) for i, j in product(rows, cols)]
    except Exception as exc:
        # Exception (not a bare except) keeps the skip-on-error behaviour
        # while letting KeyboardInterrupt/SystemExit stop the run.
        print('Could not open image: ', filename, '-', exc)
        return None

In [None]:
def extractFetures(files, subfolder):
    """Turn every image of a folder into flattened patch vectors.

    Each file is split with ``tile`` using ``PATCH_IMAGE_SIZE``; every patch
    becomes one flattened numpy array and is paired with the base name of
    the file it came from.

    Args:
        files: Iterable of file names located in ``subfolder``.
        subfolder: Directory that contains the files.

    Returns:
        A tuple ``(features, labels, file_count)`` where ``features`` and
        ``labels`` are parallel lists (one entry per patch) and
        ``file_count`` is the number of entries of ``files`` that were
        visited, including the ones that were skipped.
    """
    features, labels = [], []
    file_count = 0

    for file_count, file in enumerate(files, start=1):
        patches = tile(file, subfolder, PATCH_IMAGE_SIZE)

        # tile() yields None for unreadable files; an empty list carries
        # nothing to add either, so both cases are skipped.
        if not patches:
            continue

        source_name = os.path.basename(file)
        for patch in patches:
            # One flat vector per patch, tagged with its source file name.
            features.append(np.array(patch).flatten())
            labels.append(source_name)

    return features, labels, file_count

In [None]:
def getMeanDistance(kmean, features):
    """Compute, per cluster, the mean distance of its members to the centroid.

    Args:
        kmean: Fitted clustering model exposing ``cluster_centers_`` and
            ``labels_`` (one label per entry of ``features``).
        features: Sequence of feature vectors, in the same order as
            ``kmean.labels_``.

    Returns:
        A list with one mean Euclidean distance per row of
        ``kmean.cluster_centers_``. A cluster without members contributes
        ``0.0``.
    """
    centroids = kmean.cluster_centers_
    assignments = np.asarray(kmean.labels_)

    mean_distances = []
    # The number of clusters is taken from the model, not hard-coded.
    for i in range(len(centroids)):
        # Indices of the samples assigned to cluster i; the features are
        # picked by index so ``features`` may stay a plain list.
        members = [features[j] for j in np.flatnonzero(assignments == i)]

        print("Calculating for cluster: " + str(i))
        if not members:
            # euclidean_distances() rejects an empty sample set.
            mean_distances.append(0.0)
            continue

        # Distances come back as a (n_members, 1) column; average them.
        distances = euclidean_distances(members, centroids[i].reshape(1, -1))
        mean_distances.append(np.mean(distances))

    return mean_distances

In [None]:
def saveToCSV(files, weights):
    """Append one row per file to ``OUTPUT_FILE``.

    Each row holds the file's base name, the last ``/``-separated component
    of its directory, the weight found at the same position in ``weights``,
    and the directory itself.

    Args:
        files: Sequence of file paths.
        weights: Sequence of weights, indexed in parallel with ``files``.
    """
    with open(OUTPUT_FILE, 'a', newline='') as out:
        csv_writer = csv.writer(out)
        for position, path in enumerate(files):
            folder = os.path.dirname(path)
            csv_writer.writerow([
                os.path.basename(path),
                folder.rsplit("/", 1)[-1],
                weights[position],
                folder,
            ])

In [None]:
# For every class folder under SOURCE_DIR: cluster the image patches, derive
# one coefficient per cluster, weight each image by the clusters its patches
# fall into, and append the weights to OUTPUT_FILE.

# Number of K-Means clusters fitted per class folder.
k = 3

subfolders = [f.path for f in os.scandir(SOURCE_DIR) if f.is_dir()]
for subfolder in subfolders:
    files = os.listdir(subfolder)
    print("-----------------------------")
    print("Processing folder: ", subfolder)

    features, labels, file_count = extractFetures(files, subfolder)

    print("\nTotal Features: ", len(features))
    print("Total Labels: ", len(labels))
    print("Total File fetched: ", file_count)

    # K-Means cannot be fitted with fewer samples than clusters.
    if len(features) < k:
        print("Not enough patches to build", k, "clusters; skipping folder.")
        continue

    print("Creating Clusters...")
    kmean = KMeans(k, random_state=40)
    # fit_predict returns the assignment stored in kmean.labels_, so the
    # counts below and getMeanDistance() work on the same labels without a
    # second pass over the data.
    clusters_labels = kmean.fit_predict(features)
    print("Clusters created.")

    # NOTE(review): only *.JPG files receive a weight, while the features
    # above come from every entry returned by os.listdir — confirm intended.
    files = glob.glob(subfolder + '/*.JPG')
    print("Total Files found: ", len(files))

    # Per file: how many of its patches landed in each cluster.
    files_clusters = {
        os.path.basename(f): {'clusters': [0] * k} for f in files
    }

    # Per cluster: total number of patches assigned to it.
    cluster_count = [0] * k
    for label, cluster in zip(labels, clusters_labels):
        if label in files_clusters:
            files_clusters[label]['clusters'][cluster] += 1
        cluster_count[cluster] += 1

    print("\nCluster Size: ", cluster_count)
    if files_clusters:
        # Diagnostic peek at the first file; skipped when nothing matched.
        file1Cluster = next(iter(files_clusters.values()))
        print("File 1 Cluster: ", file1Cluster)
        print("Total features per image:", sum(file1Cluster['clusters']))

    print("Calculating mean distance...")
    mean_distances = getMeanDistance(kmean, features)
    print("Mean Distance is: ", mean_distances)

    print("Calculating weight...")
    # Raw coefficient per cluster: mean distance divided by cluster size.
    ce = [
        mean_distances[i] / cluster_count[i] if cluster_count[i] else 0.0
        for i in range(k)
    ]
    # Normalise so the coefficients sum to 1; if every raw coefficient is
    # zero, fall back to equal coefficients instead of dividing by zero.
    ce_total = sum(ce)
    if ce_total:
        cluster_coefficient = [float(c) / ce_total for c in ce]
    else:
        cluster_coefficient = [1.0 / k] * k
    print("Cluster_coefficients: ", cluster_coefficient)

    # Image weight: average of the cluster coefficients over its patches.
    weights = []
    weighted_files = []
    for f in files:
        fc = files_clusters[os.path.basename(f)]['clusters']
        patch_total = sum(fc)
        if patch_total == 0:
            # The file yielded no patches (e.g. it could not be read), so
            # there is nothing to average — leave it out of the CSV.
            print("No patches for file, skipping weight: ", f)
            continue
        w = sum(c * n for c, n in zip(cluster_coefficient, fc)) / patch_total
        weights.append(w)
        weighted_files.append(f)

    # saving weights
    print("Saving weights...")
    saveToCSV(weighted_files, weights)
    print("Weights saved.\n------------------------------\n\n")
    