<a href="https://colab.research.google.com/github/Amir-D-Shadow/Google-Colab/blob/main/preprocess_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; the cells below read from and write to /content/gdrive/MyDrive.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import json
import os
import numpy as np
from numba import jit
import random

In [None]:
def Kmean(bbox_hw,K=9,threshold=0.001,max_iterations=10000,seed=None):

   """
   Cluster box sizes into K anchor boxes with K-means.

   Boxes are assigned to the anchor with the largest IoU (see best_anchor) and
   every anchor is then replaced by the mean h and w of its boxes (see
   update_anchor_x). A round's movement is measured as the sum over anchors of
   1 - IoU(old anchor, new anchor).

   Arguments:
   bbox_hw -- (m,2) array-like : h -> 0 , w -> 1
   K -- number of anchor boxes
   threshold -- stop as soon as the summed movement of a round is below this value
   max_iterations -- maximum number of assignment/update rounds
   seed -- optional seed for picking the initial anchors; None uses the global `random` state

   Returns:
   anchors -- (K,2) numpy.ndarray : h -> 0 , w -> 1

   Raises:
   ValueError -- if there are fewer than K boxes
   """

   #one float64 array type for the jitted helpers, whatever the caller passes in
   bbox_hw = np.asarray(bbox_hw,dtype=np.float64)

   m = bbox_hw.shape[0]

   if m < K:

      raise ValueError(f"Kmean needs at least K={K} boxes but only {m} were given")

   #sample initial center
   rng = random if seed is None else random.Random(seed)
   sample_list = rng.sample(range(m),K)

   #initialize anchors (K,2)
   anchors = np.zeros((K,2))

   for idx,pos in enumerate(sample_list):

      anchors[idx,:] = bbox_hw[pos,:]

   #anchor index assigned to every box, rewritten each round
   labels = np.zeros(m,dtype=np.int64)

   #defined up front so the final print also works when no round is run
   sum_diff = 0.0

   #calculate Kmean - a bounded loop, so max_iterations really limits the work
   for _ in range(max_iterations):

      #classify bbox
      for i in range(m):

         labels[i] = best_anchor(bbox_hw[i,:].reshape(1,2),anchors)

      #update anchor box
      sum_diff = 0.0

      for i in range(K):

         #boolean indexing keeps the (n,2) shape even for an empty cluster,
         #which update_anchor_x indexes as a 2-D array
         members = bbox_hw[labels == i]

         new_h,new_w = update_anchor_x(members,anchors[i,:].reshape(1,2))

         #sum the changes for h and w of new anchor box - compare iou
         min_h = min(new_h,anchors[i,0])
         min_w = min(new_w,anchors[i,1])

         intersection = min_h * min_w
         union = new_h * new_w + anchors[i,0] * anchors[i,1] - intersection

         #a zero-area anchor that stays zero-area has not moved; skipping it
         #avoids a division by zero
         if union > 0:

            sum_diff = sum_diff + (1 - intersection / union)

         #update anchor box
         anchors[i,0] = new_h
         anchors[i,1] = new_w

      if sum_diff < threshold:

         return anchors

   #not converged within max_iterations: show the last movement
   print(sum_diff)

   return anchors
         

@jit(nopython=True)
def update_anchor_x(class_package,anchors_box):

  """
  Average the heights and widths of the boxes that belong to one anchor.

  class_package -- (n,2) : h -> 0 , w -> 1
  anchors_box -- (1,2) : h -> 0 , w -> 1

  return (mean_h,mean_w) ; the current anchor (h,w) when n == 0
  """

  count = class_package.shape[0]

  #nothing to average: keep the anchor as it is
  if count == 0:

    return anchors_box[0,0],anchors_box[0,1]

  total_h = 0
  total_w = 0

  for row in range(count):

    total_h += class_package[row,0]
    total_w += class_package[row,1]

  return total_h/count,total_w/count


@jit(nopython=True)
def best_anchor(box,anchors):

  """
  Find the anchor that overlaps the box the most (IoU of the two sizes).

  box -- (1,2) : h -> 0 , w -> 1

  anchors -- (K,2)

  return class_idx : first index with the largest IoU, 0 when no IoU is above 0
  """

  box_h = box[0,0]
  box_w = box[0,1]

  best_idx = 0
  best_iou = 0

  for idx in range(anchors.shape[0]):

    anchor_h = anchors[idx,0]
    anchor_w = anchors[idx,1]

    #overlap of the two sizes: the smaller h times the smaller w
    overlap = np.minimum(box_h,anchor_h).item() * np.minimum(box_w,anchor_w).item()

    combined = box_h * box_w + anchor_h * anchor_w - overlap

    iou = overlap / combined

    #strict comparison: an equal IoU never replaces an earlier anchor
    if iou > best_iou:

      best_iou = iou
      best_idx = idx

  return best_idx

In [None]:
def get_class(path,path_class_map ,name="class_map.txt"):

   """
   MS COCO 2017 Dataset

   Number the distinct values of the csv column at position 3 in order of
   first appearance and save the mapping as JSON in {path_class_map}/{name}.

   path -- csv file to read
   path_class_map -- directory of the saved class map
   name -- file name of the saved class map

   return dict : class value -> index
   """

   #column 3 holds the class of every row
   labels = pd.read_csv(path).iloc[:,3].to_numpy()

   class_map = {}

   for label in labels:

      #the next free index is the number of classes seen so far
      if label not in class_map:

         class_map[label] = len(class_map)

   #save class map
   with open(f"{path_class_map}/{name}","w") as out_file:

      out_file.write(json.dumps(class_map))

   return class_map

def get_bbox_info(path,path_pos,path_hw,name_pos="bbox_pos.txt",name_hw="bbox_hw.txt"):

   """
   MS COCO 2017 Dataset

   Read the positional columns (position 4 onwards) of the csv, derive the
   height and width of every box and save both arrays as JSON.

   path -- csv file to read
   path_pos , name_pos -- directory and file name of the saved positions
   path_hw , name_hw -- directory and file name of the saved heights/widths

   return numpy.ndarray bbox_pos , numpy.ndarray bbox_hw (m,2) : h -> 0 , w -> 1
   """

   #get positional data
   bbox_pos = pd.read_csv(path).iloc[:,4:].to_numpy()

   #save bbox_pos
   with open(f"{path_pos}/{name_pos}","w") as pos_file:

      pos_file.write(json.dumps(bbox_pos.tolist()))

   #construct bbox_hw (m,2) : h -> 0 , w -> 1
   #presumably the positional columns are xmin,ymin,xmax,ymax - verify against the csv
   bbox_hw = np.zeros((bbox_pos.shape[0],2))

   bbox_hw[:,0] = bbox_pos[:,3] - bbox_pos[:,1]
   bbox_hw[:,1] = bbox_pos[:,2] - bbox_pos[:,0]

   #save bbox_hw
   with open(f"{path_hw}/{name_hw}","w") as hw_file:

      hw_file.write(json.dumps(bbox_hw.tolist()))

   return bbox_pos,bbox_hw


def get_pre_define_anchor_box(bbox_hw,save_path,name="anchors.txt"):

   """
   Run Kmean on the box sizes and save the anchors as JSON in {save_path}/{name}.

   bbox_hw -- numpy.ndarray (m,2)
   save_path -- directory of the saved anchors
   name -- file name of the saved anchors

   return the anchors computed by Kmean
   """

   anchors = Kmean(bbox_hw)

   #save anchors
   with open(f"{save_path}/{name}","w") as out_file:

      out_file.write(json.dumps(anchors.tolist()))

   return anchors

In [None]:
# Show the current working directory of the kernel.
print(os.getcwd())

/content


In [None]:
#every input and output of this cell lives in the mounted Drive folder
DRIVE_DIR = "/content/gdrive/MyDrive"
VALID_SET_CSV = f"{DRIVE_DIR}/valid_set.csv"

#class map
class_map = get_class(VALID_SET_CSV,DRIVE_DIR)

#read the saved class map back; `with` closes the handle even if loading fails
with open(f"{DRIVE_DIR}/class_map.txt") as file:
   a = json.load(file)

#bbox
bbox_pos,bbox_hw = get_bbox_info(VALID_SET_CSV,DRIVE_DIR,DRIVE_DIR)

#read both saved bbox files back, closing each handle before the next is opened
with open(f"{DRIVE_DIR}/bbox_pos.txt") as file:
   b = json.load(file)

with open(f"{DRIVE_DIR}/bbox_hw.txt") as file:
   c = json.load(file)

#anchors
anchors = get_pre_define_anchor_box(bbox_hw,DRIVE_DIR)