### <span style="color:yellow"> Import Libraries</span>

In [1]:
from pathlib import Path
import pandas as pd
import json
from typing import List
import yaml
import numpy as np

### <span style="color:yellow"> The working directory must be TACO</span>
### ` If it is not, change into it: `
> ```bash
> cd TACO/
> ```

In [2]:
pwd

'/notebooks/TACO'

### <span style="color:yellow">Download the TACO dataset</span>
> Run this from a terminal; see README.md for the full instructions
> ```bash
> python download.py
> ```

### <span style="color:yellow">Once the dataset has been downloaded</span>

In [3]:
cd detector/

/notebooks/TACO/detector


In [4]:
!python split_dataset.py --dataset_dir ../data

`` The annotations have been split into 30 JSON files: ``
> * 10 Train
> * 10 Test
> * 10 val

` Return to the project root (the directory that contains TACO/) `

In [12]:
# The previous cells moved into TACO/detector, so the project root (the folder that
# holds TACO/, datasets/ and yolov7/) is two levels up, not one.
%cd ../..

/notebooks


`` Assign a variable for taco datasets ``

In [17]:
# Location of the TACO data folder, resolved from the current working directory
DATA_TACO = Path.cwd().joinpath("TACO", "data")
DATA_TACO

PosixPath('/notebooks/TACO/data')

In [24]:
ls

[0m[01;34mTACO[0m/  [01;34mdatasets[0m/  [01;34myolov7[0m/


### Build dataset mapping directories of yolov7
1. > Create the images directory of each split (train, test and val), and write one annotations JSON file per split directory from the TACO annotation JSON files
2.  > Create the labels directory of each split (train, test and val); the labels directories contain txt files


### Function to build images directories

In [28]:
def build_split_images_and_json_file(path_dir_annotations:Path, list_dir_split:List[str]):
    """Build the image folders and per-split annotation files used for YOLOv7.

    For every split name, ``<cwd>/datasets/<split>/images`` is created and the images listed
    in the split's annotation JSON are copied into it. Each copy is named after the original
    ``file_name`` with ``/`` replaced by ``_``. The annotation dict, with every ``file_name``
    rewritten to ``<split>/images/<flattened name>``, is saved as
    ``<cwd>/datasets/<split>/annotations-<split>.json``.

    Annotation JSON files are searched in the PARENT of ``path_dir_annotations`` and belong
    to a split when their file name contains the split name. When several files match, their
    top-level keys overwrite one another, so the last file processed wins. Files are
    processed in sorted name order, which makes the winner the same on every run.

    Args:
        path_dir_annotations (Path): Directory against which the ``file_name`` entries of the
            annotation files are resolved (e.g. TACO/data/annotations).
        list_dir_split (List[str]): Split names (e.g. ["train","test","val"]).

    Raises:
        FileNotFoundError: If no annotation JSON file matches a split name, or if an image
            referenced by the annotations does not exist.
    """
    # project directory
    project_dir = Path.cwd()
    print(project_dir)
    # root of the generated dataset; holds one sub-folder per split
    datasets = project_dir/"datasets"
    datasets.mkdir(exist_ok=True)
    # Sorted so that the file that wins the merge below does not depend on the
    # arbitrary order in which the file system lists the directory.
    file_json = sorted(
        f for f in path_dir_annotations.parent.iterdir()
        if f.is_file() and f.name.endswith(".json")
    )
    # Image paths in the annotation files are relative to this directory
    path_dir_image_batch = path_dir_annotations
    print(path_dir_image_batch)

    for annot in list_dir_split:
        print(annot)
        print(datasets)
        dir_split = datasets/annot
        images_dir = dir_split/"images"
        images_dir.mkdir(parents=True, exist_ok=True)

        # Match on the file NAME only: a parent folder that happens to contain the split
        # name must not pull every JSON file into this split.
        file_split = [f for f in file_json if annot in f.name]
        if not file_split:
            raise FileNotFoundError(
                f"No annotation JSON file containing '{annot}' found in {path_dir_annotations.parent}"
            )

        # NOTE(review): presumably each matching file is one complete trial of the split
        # (annotations_<k>_<split>.json), so keeping the last one in sorted order picks the
        # same trial for every split -- confirm against split_dataset.py.
        dict_js = {}
        for fic in file_split:
            with open(fic,"r", encoding="utf-8") as f:
                dict_js.update(json.load(f))

        for el in dict_js["images"]:
            file_name = el["file_name"]
            path_img = path_dir_image_batch/file_name
            # "batch_1/000006.jpg" -> "batch_1_000006.jpg" so all images share one flat folder
            file_out = file_name.replace("/", "_")
            output_file = images_dir/file_out
            el["file_name"] = f"{annot}/images/{file_out}"
            # Skip images copied by a previous run
            if not output_file.exists():
                output_file.write_bytes(path_img.read_bytes())

        with open(dir_split/f"annotations-{annot}.json","w", encoding="utf-8") as f:
            json.dump(dict_js,f,indent=4)

             

### Run Function build_split_images_and_json_file

In [18]:
# Directory passed to build_split_images_and_json_file as its annotations directory
annotations = DATA_TACO.joinpath("annotations")
# Names of the split folders to generate
split_list_dir = ["train", "test", "val"]

In [29]:
# Fill datasets/<split>/images and write datasets/<split>/annotations-<split>.json for every split name
build_split_images_and_json_file(annotations,split_list_dir)

/notebooks
/notebooks/TACO/data/annotations
train
/notebooks/datasets
test
/notebooks/datasets
val
/notebooks/datasets


### check if the image transfer went well

In [30]:
# Function checking count of each images directories as well
def check_count_imgs_in_split_dir(list_path_images_split_dir:List[Path]):
    """Print how many regular files each images directory holds.

    Args:
        list_path_images_split_dir (List[Path]): Images directories to inspect; the name of
            each directory's parent is used as the label in the printed line.
    """
    for images_dir in list_path_images_split_dir:
        split_name = images_dir.parent.name
        # Sub-directories are not counted, only files
        n_files = sum(1 for entry in images_dir.iterdir() if entry.is_file())
        print(f"count images of {split_name} folder is : {n_files}")


In [31]:
# The generated dataset lives next to the TACO checkout, two levels above TACO/data
datasets = DATA_TACO.parent.parent / "datasets"
# One images directory per split
train_images_dir, test_images_dir, val_images_dir = (
    datasets / split / "images" for split in ("train", "test", "val")
)
list_split_pathImgs = [train_images_dir, test_images_dir, val_images_dir]

In [32]:
# Print the number of files found in each split's images directory
check_count_imgs_in_split_dir(list_split_pathImgs)

count images of train folder is : 1200
count images of test folder is : 150
count images of val folder is : 150


## Create the labels directory for each split directory 
In each labels directory we create one txt file per image, holding that image's segmentations.
These files contain the normalised coordinates of the outline of each piece of waste in the image

In [36]:
def normalise_bbox(df):
    """Convert the bounding box of the first row of ``df`` to normalised centre format.

    The box is read as ``[x, y, w, h]`` from the ``bbox`` column and divided by the image
    size taken from the ``width`` and ``height`` columns of the same row.

    Args:
        df: DataFrame with ``bbox``, ``width`` and ``height`` columns; only row 0 is used.

    Returns:
        list: ``[x_center / width, y_center / height, w / width, h / height]``, each value
        rounded to 6 decimals.
    """
    x, y, w, h = df["bbox"].values[0][:4]
    W = df["width"].values[0]
    H = df["height"].values[0]

    X = np.round((x + w/2)/W, 6)
    Y = np.round((y + h/2)/H, 6)
    # Round to 6 decimals like the other components; rounding to an integer would
    # collapse the relative width to 0 or 1.
    wn = np.round(w/W, 6)
    hn = np.round(h/H, 6)
    return [X, Y, wn, hn]

In [35]:
def normalize_segmentation(seg_values):
    """Normalise the first polygon of the first row of ``seg_values``.

    The polygon is a flat list of alternating coordinates: values at even positions are
    divided by the image width and values at odd positions by the image height.

    Args:
        seg_values: DataFrame with ``segmentation``, ``width`` and ``height`` columns; only
            row 0 and the first polygon of its segmentation are used.

    Returns:
        list: The normalised coordinates, each rounded to 6 decimals, in the original order.
    """
    polygon = seg_values['segmentation'].values[0][0]
    # Position parity picks the divisor: 0 -> width (x), 1 -> height (y)
    image_dims = (seg_values["width"].values, seg_values["height"].values)
    return [
        np.round(float(coord)/float(image_dims[pos % 2][0]), 6)
        for pos, coord in enumerate(polygon)
    ]

In [11]:
# Load one train annotation file for inspection. The path is built from DATA_TACO so
# the cell does not depend on a machine-specific absolute path.
with open(DATA_TACO/"annotations_0_train.json", "r", encoding="utf-8") as f:
    annotates = json.load(f)

In [16]:
 #Create DataFrame from data json file
# One row per image, keyed by image_id so it can be joined with the annotations
images = pd.DataFrame(
    annotates["images"], columns=["id","file_name","width","height"]
).rename(columns={"id":"image_id"})
annot = pd.DataFrame(annotates["annotations"], columns=["id","image_id","category_id","segmentation","bbox"])
cat = pd.DataFrame(annotates["categories"]).rename(columns={"id":"category_id"}).sort_values(by="category_id", ascending=True)
# One row per annotation, enriched with its image size and its category names
df = annot.merge(images).merge(cat)
# Supercategories are numbered in order of first appearance among the sorted categories
cat_index = cat["supercategory"].unique()
# Create the column explicitly: `df.supercategory_id = ...` would only set a Python
# attribute on the DataFrame, not a column.
df["supercategory_id"] = df["supercategory"].map(
    {supercategory: index for index, supercategory in enumerate(cat_index)}
)

In [17]:
df.columns

Index(['id', 'image_id', 'category_id', 'segmentation', 'bbox', 'file_name',
       'width', 'height', 'supercategory', 'name', 'supercategory_id'],
      dtype='object')

In [13]:
df["width"].values

array([1537, 3264, 1824, ..., 2268, 2448, 2448], dtype=int64)

In [19]:
from pprint import pprint

In [34]:
# df_h = df[df["image_id"]==1]

In [28]:
# filter_df = df[df["image_id"]==1][["image_id","category_id","segmentation","bbox","file_name","width","height","supercategory_id"]]
# filter_df.iloc[0:1,:]["segmentation"]

0    [[928.0, 1876.0, 938.0, 1856.0, 968.0, 1826.0,...
Name: segmentation, dtype: object

In [None]:
# uniq = df["image_id"].unique()
# for un in uniq:
#     filter_df = df[df["image_id"]==un]
#     pprint(filter_df)


In [37]:
def build_labels_txt_with_segmentations(data_path_dir:Path,names_dir:List[str]):
    """Write the label txt files of every split directory.

    For each split, the first ``*.json`` file found in ``<data_path_dir>/<name>`` is read and
    two folders are filled with one ``<image file stem>.txt`` per annotated image, holding
    one line per annotation:

    * ``labels``: ``<category_id> <normalised polygon coordinates>``, built with
      ``normalize_segmentation`` (first polygon of the annotation only).
    * ``labels_supCat``: ``<supercategory index> <normalised bbox>``, built with
      ``normalise_bbox``. Supercategories are numbered in order of first appearance among
      the categories sorted by id.

    Label files are rewritten from scratch on every call, so running the function again
    does not duplicate lines.

    Args:
        data_path_dir (Path): Dataset root that contains the split directories.
        names_dir (List[str]): Names of the split directories (e.g ["train","test","val"]).
    """
    for name in names_dir:
        # Resolve everything from the argument, not from a notebook-level global
        split_dir = data_path_dir/name

        # Annotation file previously written into the split directory
        path_annotations = [f for f in split_dir.iterdir() if str(f).endswith(".json")][0]

        labelsTrain_path = split_dir/"labels"
        labelsTrain_path_supcat = split_dir/"labels_supCat"
        labelsTrain_path.mkdir(exist_ok=True, parents=True)
        labelsTrain_path_supcat.mkdir(exist_ok=True, parents=True)

        with open(path_annotations, "r", encoding="utf-8") as f:
            annotates = json.load(f)

        # One row per annotation, enriched with image size and category names
        images = pd.DataFrame(
            annotates["images"], columns=["id","file_name","width","height"]
        ).rename(columns={"id":"image_id"})
        annot = pd.DataFrame(annotates["annotations"], columns=["id","image_id","category_id","segmentation","bbox"])
        cat = pd.DataFrame(annotates["categories"]).rename(columns={"id":"category_id"}).sort_values(by="category_id", ascending=True)
        df = annot.merge(images).merge(cat)

        cat_index = cat["supercategory"].unique()
        # Explicit column creation; attribute assignment on a DataFrame does not add a column
        df["supercategory_id"] = df["supercategory"].map(
            {supercategory: index for index, supercategory in enumerate(cat_index)}
        )

        # Collect the lines per output file first, then write each file once.
        seg_lines_by_stem = {}
        bbox_lines_by_stem = {}
        # sort=False keeps the images in order of first appearance
        for _, seg in df.groupby("image_id", sort=False):
            stem = Path(seg['file_name'].values[0]).stem
            for row in range(len(seg.index)):
                # One-row frame, the shape both normalisation helpers expect
                labels_seg = seg.iloc[row:row+1,:]

                coord_seg = " ".join(str(x) for x in normalize_segmentation(labels_seg))
                seg_lines_by_stem.setdefault(stem, []).append(
                    f"{labels_seg['category_id'].values[0]} {coord_seg}\n"
                )

                coord_bbox = " ".join(str(x) for x in normalise_bbox(labels_seg))
                bbox_lines_by_stem.setdefault(stem, []).append(
                    f"{int(labels_seg['supercategory_id'].values[0])} {coord_bbox}\n"
                )

        # "w" mode: a second run replaces the files instead of appending to them
        for stem, lines in bbox_lines_by_stem.items():
            with open(labelsTrain_path_supcat/f"{stem}.txt","w", encoding="utf-8") as f:
                f.writelines(lines)
        for stem, lines in seg_lines_by_stem.items():
            with open(labelsTrain_path/f"{stem}.txt","w", encoding="utf-8") as f:
                f.writelines(lines)
    print("==================finished===========================")

In [38]:
# Write the labels and labels_supCat txt files for the train, test and val splits
build_labels_txt_with_segmentations(datasets,["train","test","val"])



In [28]:
# Load one test annotation file for inspection. The path is built from DATA_TACO so
# the cell does not depend on a machine-specific absolute path.
with open(DATA_TACO/"annotations_0_test.json", "r", encoding="utf-8") as f:
    get = json.load(f)

In [29]:
an = get["annotations"]

In [41]:
Path.cwd()

PosixPath('/notebooks')

In [46]:
def create_yaml_taco_data():
    """Write the dataset description files ``taco.yaml`` and ``taco_sup.yaml``.

    The category list is read from the first file of ``<cwd>/TACO/data`` whose path ends
    with ``annotations.json``. Both files are written to ``<cwd>/yolov7/data`` and contain
    the same train/val/test image paths plus ``nc`` (number of classes) and ``names``:

    * ``taco.yaml`` uses the unique category names.
    * ``taco_sup.yaml`` uses the unique supercategory names.

    Names are listed in order of first appearance among the categories sorted by id.

    Raises:
        FileNotFoundError: If no ``*annotations.json`` file exists in ``<cwd>/TACO/data``.
    """
    cur_dir = Path.cwd()
    taco_data = cur_dir/"TACO"/"data"
    yolov7_data = cur_dir/"yolov7"/"data"

    candidates = [f for f in taco_data.iterdir() if str(f).endswith("annotations.json")]
    if not candidates:
        raise FileNotFoundError(f"No file ending with 'annotations.json' found in {taco_data}")
    with open(candidates[0], "r", encoding="utf-8") as f:
        global_annotations = json.load(f)

    categories = (
        pd.DataFrame(global_annotations["categories"])
        .rename(columns={"id":"category_id"})
        .sort_values(by="category_id")
    )

    # Image folders, identical in both description files
    split_paths = {
        "train": "../datasets/train/images",
        "val": "../datasets/val/images",
        "test": "../datasets/test/images",
    }
    # (output file, categories column that provides the class names)
    for yaml_name, column in (("taco_sup.yaml", "supercategory"), ("taco.yaml", "name")):
        class_names = list(categories[column].unique())
        description = {**split_paths, "nc": len(class_names), "names": class_names}
        with open(yolov7_data/yaml_name, "w", encoding="utf-8") as f:
            yaml.dump(description, f, indent=4)
    

In [48]:
# Write yolov7/data/taco.yaml and yolov7/data/taco_sup.yaml
create_yaml_taco_data()