In [2]:
import os

# Directory containing the label files (presumably one per image; verify against the dataset).
labelPath = "dataset/labels"

# Keep only names with a real ".txt" extension. Later cells strip the last four
# characters (file[:-4]) to recover the image stem, which is only correct when
# the name actually ends in ".txt" rather than merely in the letters "txt".
files = [name for name in os.listdir(labelPath) if name.endswith(".txt")]

# Class names, indexed by the integer class id that starts each label line.
classes = [
    'black sheep', 'brown sheep', 'grey sheep', 'white sheep',
    'black occluded sheep', 'brown occluded sheep', 'grey occluded sheep', 'white occluded sheep',
]

In [3]:


# Aggregates built from the label files.
class_img = {}        # class name -> set of image stems containing that class
class_count = {}      # class name -> total number of instances
img_class = {}        # image stem -> set of class names present in it
img_class_count = {}  # image stem -> {class name: number of instances}
total_sheep = 0       # number of annotation lines read
total_imgs = 0        # number of label files visited

for file in files:
    total_imgs += 1
    i = file[:-4]  # drop the 4-character ".txt" suffix to get the image stem
    with open(f"{labelPath}/{file}") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): not an annotation.
                continue

            total_sheep += 1

            # The class id is the first whitespace-separated field. Parsing the
            # whole field (not just its first character) keeps ids >= 10 and
            # lines with leading whitespace working.
            # NOTE(review): assumes whitespace-separated label lines - confirm.
            c = classes[int(fields[0])]

            class_img.setdefault(c, set()).add(i)
            class_count[c] = class_count.get(c, 0) + 1
            img_class.setdefault(i, set()).add(c)

            per_class = img_class_count.setdefault(i, {})
            per_class[c] = per_class.get(c, 0) + 1

In [3]:
# Per-class instance totals accumulated over every label file that was read.
print("Instances of each class:")
class_count

Instances of each class:


{'black sheep': 2362,
 'grey sheep': 3515,
 'brown sheep': 1071,
 'white sheep': 12380,
 'white occluded sheep': 904,
 'grey occluded sheep': 149,
 'black occluded sheep': 40,
 'brown occluded sheep': 10}

In [4]:
# For each class, report how many distinct images contain at least one instance.
print("Images containing class:")
for name in class_img:
    print(name, len(class_img[name]))

Images containing class:
black sheep 1017
grey sheep 1150
brown sheep 445
white sheep 1739
white occluded sheep 516
grey occluded sheep 75
black occluded sheep 37
brown occluded sheep 7


In [5]:
# Instance totals for the four plain classes and for the four "occluded" classes.
visible_classes = ('black sheep', 'brown sheep', 'grey sheep', 'white sheep')
print("Non-occluded sheep count:", sum(class_count[name] for name in visible_classes))

occluded_classes = ('black occluded sheep', 'brown occluded sheep', 'grey occluded sheep', 'white occluded sheep')
print("Occluded sheep count:", sum(class_count[name] for name in occluded_classes))

Non-occluded sheep count: 19328
Occluded sheep count: 1103


In [5]:
# Number of annotation lines read across all label files (one increment per line).
total_sheep

20431

In [6]:
# Number of label files visited by the counting loop (incremented once per file,
# whether or not the file contained any lines).
total_imgs

2125

In [7]:
# Share of all annotated instances that belong to one of the "occluded" classes.
# class_count only has keys for classes that were actually seen, so use .get()
# to treat an unseen class as zero instead of raising KeyError.
occluded_classes = ('black occluded sheep', 'brown occluded sheep', 'grey occluded sheep', 'white occluded sheep')
occluded_instances = sum(class_count.get(name, 0) for name in occluded_classes)
f"Occluded sheep percentage: {100 * occluded_instances / total_sheep:.2f}%"

'Occulded sheep percentage: 5.40%'

In [8]:
# Share of images that contain at least one occluded sheep of any colour.
# The per-class image sets overlap (one image can hold several occluded classes),
# so count the union of the sets; adding up the four set sizes would count such
# an image once per occluded class and overstate the percentage.
occluded_classes = ('black occluded sheep', 'brown occluded sheep', 'grey occluded sheep', 'white occluded sheep')
occluded_imgs = {img for name in occluded_classes for img in class_img.get(name, ())}
f"Images containing occluded sheep percentage: {100 * len(occluded_imgs) / total_imgs:.2f}%"

'Images containing occluded sheep percentage: 29.88%'

In [13]:
# Fraction of every class's instances that each split should receive.
split = {"train": 0.7, "val": 0.15, "test": 0.15}

# target[split_name][class_name] = ideal (possibly fractional) number of
# instances of that class in that split.
# Built with comprehensions so that no loop variable named `set` is left behind:
# shadowing the builtin makes every later `set()` call (e.g. when the counting
# cell is re-run) fail with "'str' object is not callable".
target = {
    split_name: {clas: count * proportion for clas, count in class_count.items()}
    for split_name, proportion in split.items()
}

In [37]:
import random
# Fixed seed so the shuffle below is repeatable.
random.seed(69)

# `files` comes from os.listdir, whose order is arbitrary and may differ between
# filesystems. Sort before shuffling so the same seed always produces the same
# permutation, and therefore the same split, on every machine.
labels = sorted(files)
random.shuffle(labels)

# Label file names assigned to each split; filled by the greedy loop.
sets = {"train": [], "val": [], "test": []}

def calc_diff(assigned, target):
    """Score how far ``assigned`` falls short of ``target``.

    Only classes whose assigned count is still below the target contribute.
    Each contributes ``target / max(assigned, 0.1)`` divided by the class's
    total in the module-level ``class_count``; the 0.1 floor avoids dividing
    by zero when nothing has been assigned yet.

    Returns 0 when no class is below its target.
    """
    shortfall = 0
    for name, wanted in target.items():
        if assigned[name] >= wanted:
            continue
        ratio = wanted / max(assigned[name], 0.1)
        shortfall += ratio / class_count[name]
    return shortfall

def calc_modified(assigned, clas_count):
    """Return a copy of ``assigned`` with the counts from ``clas_count`` added.

    Classes missing from ``assigned`` start at 0. Neither argument is mutated.
    """
    updated = assigned.copy()
    for name in clas_count:
        updated[name] = updated.get(name, 0) + clas_count[name]
    return updated

# assigned[split_name][class_name] = instances placed in that split so far.
# Every split starts with an independent all-zero counter per class.
# A comprehension is used so that no loop variable named `set` is left behind:
# shadowing the builtin breaks any later `set()` call in the notebook.
assigned = {split_name: dict.fromkeys(class_count, 0) for split_name in split}

# Greedy assignment: walk the shuffled label files and put each one into the
# split whose shortfall (as scored by calc_diff) it reduces the most.
for file in labels:
    clas_count = img_class_count.get(file[:-4])
    if clas_count is None:
        # No entry means the label file contributed no annotation lines; such
        # files are left out of every split.
        continue

    weights = {}
    candidates = {}
    # The loop variable is deliberately not called `set`: shadowing the builtin
    # would break every later `set()` call in the notebook.
    for split_name in sets:
        candidates[split_name] = calc_modified(assigned[split_name], clas_count)
        before = calc_diff(assigned[split_name], target[split_name])
        after = calc_diff(candidates[split_name], target[split_name])
        weights[split_name] = before - after

    # max() returns the first maximal key, so ties go to train, then val, then test.
    winner = max(weights, key=weights.get)
    assigned[winner] = candidates[winner]
    sets[winner].append(file)

In [None]:
path = "dataset"

# Write one list file per split; each line is the image path that corresponds
# to a label file (".txt" suffix replaced by ".jpg" under ./images/).
for s in sets:
    image_lines = [f"./images/{filename[:-4]}.jpg\n" for filename in sets[s]]
    with open(f"{path}/customsplit_{s}.txt", "w+") as output:
        output.writelines(image_lines)

In [38]:
# Ideal per-split, per-class instance counts (class total x split proportion).
target

{'train': {'black sheep': 1653.3999999999999,
  'grey sheep': 2460.5,
  'brown sheep': 749.6999999999999,
  'white sheep': 8666.0,
  'white occluded sheep': 632.8,
  'grey occluded sheep': 104.3,
  'black occluded sheep': 28.0,
  'brown occluded sheep': 7.0},
 'val': {'black sheep': 354.3,
  'grey sheep': 527.25,
  'brown sheep': 160.65,
  'white sheep': 1857.0,
  'white occluded sheep': 135.6,
  'grey occluded sheep': 22.349999999999998,
  'black occluded sheep': 6.0,
  'brown occluded sheep': 1.5},
 'test': {'black sheep': 354.3,
  'grey sheep': 527.25,
  'brown sheep': 160.65,
  'white sheep': 1857.0,
  'white occluded sheep': 135.6,
  'grey occluded sheep': 22.349999999999998,
  'black occluded sheep': 6.0,
  'brown occluded sheep': 1.5}}

In [39]:
# Per-split, per-class instance counts actually reached by the greedy assignment.
assigned

{'train': {'black sheep': 1638,
  'grey sheep': 2427,
  'brown sheep': 744,
  'white sheep': 8585,
  'white occluded sheep': 627,
  'grey occluded sheep': 87,
  'black occluded sheep': 26,
  'brown occluded sheep': 6},
 'val': {'black sheep': 360,
  'grey sheep': 546,
  'brown sheep': 165,
  'white sheep': 1910,
  'white occluded sheep': 139,
  'grey occluded sheep': 31,
  'black occluded sheep': 6,
  'brown occluded sheep': 2},
 'test': {'black sheep': 364,
  'grey sheep': 542,
  'brown sheep': 162,
  'white sheep': 1885,
  'white occluded sheep': 138,
  'grey occluded sheep': 31,
  'black occluded sheep': 8,
  'brown occluded sheep': 2}}

In [120]:
setPath = "dataset"

# Per-class instance totals for each "autosplit_<split>.txt" list file.
# The split names are iterated inline instead of being assigned to `sets`:
# `sets` already holds the custom split (split name -> label file names), and
# rebinding it to a list would discard that result and break the export cell.
split_class_count = {}

for s in ("train", "val", "test"):
    counts = split_class_count[s] = {}
    with open(f"{setPath}/autosplit_{s}.txt") as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            # Reduce "./images/<stem>.jpg" to "<stem>". Unlike fixed slicing
            # this still works when the last line has no trailing newline.
            # NOTE(review): assumes one image path per line - confirm.
            i = os.path.splitext(os.path.basename(entry))[0]
            # Images without a recorded annotation contribute nothing.
            for c, count in img_class_count.get(i, {}).items():
                counts[c] = counts.get(c, 0) + count


In [121]:
# Show the per-split class totals as (split name, {class: count}) pairs.
list(split_class_count.items())

[('train',
  {'white sheep': 8886,
   'white occluded sheep': 627,
   'grey sheep': 2450,
   'black sheep': 1712,
   'black occluded sheep': 31,
   'grey occluded sheep': 81,
   'brown sheep': 758,
   'brown occluded sheep': 5}),
 ('val',
  {'white sheep': 1835,
   'white occluded sheep': 149,
   'grey sheep': 522,
   'black sheep': 330,
   'grey occluded sheep': 38,
   'brown sheep': 175,
   'black occluded sheep': 3,
   'brown occluded sheep': 1}),
 ('test',
  {'white sheep': 1659,
   'white occluded sheep': 128,
   'grey sheep': 543,
   'black sheep': 320,
   'black occluded sheep': 6,
   'grey occluded sheep': 30,
   'brown sheep': 138,
   'brown occluded sheep': 4})]

In [25]:
def class_proportion(c_count):
    """Convert per-class counts into fractions of the overall total.

    Takes a mapping of class name to count and returns a new mapping with the
    same keys, in the same order, whose values are ``count / total``. An empty
    mapping yields an empty result.
    """
    total = 0
    for amount in c_count.values():
        total += amount
    return {name: amount / total for name, amount in c_count.items()}

def print_cp(cp):
    """Print one line per class: the name, then its proportion as a percentage
    with two decimals (the proportion is multiplied by 100)."""
    for name in cp:
        print(name, f"{100 * cp[name]:.2f}%")

In [26]:
# Class mix over the whole dataset first, then over each split, so the
# percentages can be compared side by side.
print("total")
print_cp(class_proportion(class_count))
print()

for s in split_class_count:
    print(s)
    print_cp(class_proportion(split_class_count[s]))
    print()

total
black sheep 11.56%
grey sheep 17.20%
brown sheep 5.24%
white sheep 60.59%
white occluded sheep 4.42%
grey occluded sheep 0.73%
black occluded sheep 0.20%
brown occluded sheep 0.05%

train
white sheep 60.41%
white occluded sheep 4.36%
grey sheep 17.32%
black sheep 12.00%
black occluded sheep 0.22%
grey occluded sheep 0.52%
brown sheep 5.11%
brown occluded sheep 0.06%

val
white sheep 61.64%
white occluded sheep 4.36%
grey sheep 16.36%
black sheep 9.83%
black occluded sheep 0.20%
grey occluded sheep 0.97%
brown sheep 6.61%
brown occluded sheep 0.03%

test
white sheep 60.46%
white occluded sheep 4.76%
black sheep 11.20%
grey sheep 17.45%
grey occluded sheep 1.45%
brown sheep 4.54%
black occluded sheep 0.09%
brown occluded sheep 0.03%

