# Check the annotations
- 「length is 0」のようなエラーが出るため、すべての画像に annotation が少なくとも 1 つあることを確認する
- あわせて train/val/test への分割も行う

```
fy2022
    / 0127 -> train/val/test = 80/10/10
        splits.csv  # image id, phase (train/test/val)
        /train
            /images
            annotation.json
        /test
            /images
            annotation.json
        /val
            /images
            annotation.json
```

In [20]:
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
import mmcv
# split the unique ids into train/test/val
from sklearn.model_selection import train_test_split

In [6]:
# Dataset root; every other path in this notebook is derived from it.
root_path = Path("../data/bkt-orama-anocci-prod/caddi/fy2022q1-200/")
# Directory the original images are copied from.
images_from_dir = root_path / "images"
# Annotation JSON covering the whole dataset.
annotations_file = root_path / "annots/caddi-fy2022q1-200.json"
# Output directory for this experiment's train/test/val split.
exp_dir = root_path / "0127"
# CSV recording which phase each image id was assigned to.
csv_file = exp_dir / "splits.csv"

def get_dir(phase="train") -> tuple[Path, Path]:
    """Return ``(images_dir, annotation_file)`` for *phase* under ``exp_dir``.

    The first element is the ``images`` directory of the phase, the second is
    the path of its ``annotation.json`` file (a file, not a directory).
    """
    phase_root = exp_dir / phase
    return phase_root / "images", phase_root / "annotation.json"

# Paths for the default phase ("train").
im_dir, an_file = get_dir()

In [38]:
# Load the full annotation file and list its top-level sections.
annotations = mmcv.load(annotations_file)
annotations.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [39]:
# One row per entry of the 'annotations' section.
df = pd.DataFrame(annotations['annotations'])
# One row per entry of the 'images' section.
images = pd.DataFrame(annotations['images'])

In [40]:
# Preview the first annotation rows.
df.head()

Unnamed: 0,id,image_id,segmentation,area,iscrowd,bbox,category_id
0,0,0,"[[1376.28662109375, 1914.3616943359375, 1376.2...",226886.30017,0,"[1376.28662109375, 1914.3616943359375, 466.211...",1
1,1,0,"[[1377.6597900390625, 2740.48291015625, 1377.6...",308129.399743,0,"[1377.6597900390625, 2740.48291015625, 466.751...",1
2,2,1,"[[2086.919189453125, 2363.779052734375, 2086.9...",171346.469903,0,"[2086.919189453125, 2363.779052734375, 424.510...",1
3,3,1,"[[980.4075927734376, 2367.258544921875, 980.40...",70514.600412,0,"[980.4075927734376, 2367.258544921875, 180.938...",1
4,4,2,"[[1258.90478515625, 2021.5059814453125, 1258.9...",198404.788943,0,"[1258.90478515625, 2021.5059814453125, 133.895...",1


In [16]:
# Distinct image ids referenced by at least one annotation row.
unique_ids = df.image_id.unique()
print(len(unique_ids))

200


In [17]:
# Show the resolved paths for the default phase ("train").
im_dir, an_file = get_dir()
im_dir, an_file

(PosixPath('../data/bkt-orama-anocci-prod/caddi/fy2022q1-200/0127/train/images'),
 PosixPath('../data/bkt-orama-anocci-prod/caddi/fy2022q1-200/0127/train/annotation.json'))

In [24]:
def make_splits(unique_ids, ratio=(80, 10, 10)) -> dict[str, list]:
    """Split ids into train/val/test subsets.

    Args:
        unique_ids: Array-like of ids to partition.
        ratio: Three relative weights in the order ``(train, val, test)``.
            Only their proportions matter.

    Returns:
        Dict with the keys ``"train"``, ``"val"`` and ``"test"``, each mapping
        to the ids assigned to that subset.

    Raises:
        ValueError: If ``ratio`` does not contain exactly three weights.
    """
    if len(ratio) != 3:
        raise ValueError(
            f"ratio must have exactly three entries (train, val, test), got {len(ratio)}"
        )
    train_weight, val_weight, test_weight = ratio

    train_size = train_weight / sum(ratio)
    # The second split only sees the non-train remainder, so the validation
    # share is expressed relative to val + test rather than to the total.
    val_size = val_weight / (val_weight + test_weight)

    # Fixed random_state on both calls keeps the split reproducible.
    train, val_test = train_test_split(unique_ids, train_size=train_size, random_state=42)
    val, test = train_test_split(val_test, train_size=val_size, random_state=42)
    res = {
        "train": train,
        "val": val,
        "test": test,
    }
    print([f"{key}: {len(subset)}" for key, subset in res.items()])
    return res

In [55]:
# Partition the image ids using make_splits' default ratio.
splits = make_splits(unique_ids)

['train: 160', 'val: 20', 'test: 20']


In [56]:
# Build one long table of (image id, phase) rows so the split can be saved.
for_df = []
for phase in ['train', 'test', 'val']:
    ids = splits[phase]

    # One frame per phase, tagged with the phase name.
    # NOTE(review): the column used to be misspelled 'iamge_ids'; anything that
    # reads an older splits.csv by column name needs the same rename.
    df_part = pd.DataFrame({'image_ids': ids})
    df_part['phase'] = phase
    for_df.append(df_part)

df_ids = pd.concat(for_df)

In [57]:
# Create the experiment directory, including any missing parents, so this cell
# does not depend on the directory already existing.
csv_file.parent.mkdir(parents=True, exist_ok=True)
# index=False: the positional index carries no information, keep it out of the CSV.
df_ids.to_csv(csv_file, index=False)

In [58]:
import shutil
def copy_images(images, ids, im_dir, from_dir=None):
    """Copy the image files belonging to ``ids`` into ``im_dir``.

    Args:
        images: DataFrame with at least the columns ``id`` and ``file_name``.
        ids: Sized iterable of image ids to copy.
        im_dir: Destination directory; created (with parents) when missing.
        from_dir: Directory the files are read from. When ``None`` the
            module-level ``images_from_dir`` is used.

    Raises:
        KeyError: If an id in ``ids`` has no row in ``images``.
    """
    print(len(ids))
    if len(ids) == 0:
        return

    source_dir = Path(images_from_dir if from_dir is None else from_dir)
    target_dir = Path(im_dir)

    # Build the id -> file name lookup once instead of scanning the table for
    # every id. For duplicated ids the first row wins.
    file_names = images.drop_duplicates(subset="id").set_index("id")["file_name"]

    for image_id in ids:
        if image_id not in file_names.index:
            raise KeyError(f"image id {image_id!r} not found in images table")
        file_name = file_names.loc[image_id]
        to_path = target_dir / file_name
        # file_name may contain sub-directories, so create the parent per file.
        to_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(source_dir / file_name, to_path)

# Populate each phase's images directory from the current `splits`.
# (Re-run `splits = make_splits(unique_ids)` first if `splits` is stale.)
for phase in ('train', 'test', 'val'):
    im_dir = get_dir(phase)[0]
    ids = splits[phase]
    copy_images(images, ids, im_dir)


160
20
20


In [59]:
# Preview the first image rows.
images.head()

Unnamed: 0,id,license,file_name,coco_url,height,width,date_captured,flickr_url
0,0,0,target-001.png,,4963,3509,,
1,1,0,target-002.png,,4963,3509,,
2,2,0,target-003.png,,4963,3509,,
3,3,0,target-004.png,,4963,3509,,
4,4,0,target-005.png,,4963,3509,,


# Tabulate the 'categories' section of the annotation file.
df_annotations_categories = pd.DataFrame(annotations['categories'])
df_annotations_categories

# Keep only the second category row (positional slice 1:2), still as a DataFrame.
df_category_new = df_annotations_categories.iloc[1:2, :]
df_category_new

In [60]:
# Re-check the top-level sections before writing the per-phase files.
annotations.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [63]:
# Write one annotation file per phase, restricted to that phase's image ids.
# splits = make_splits(unique_ids)
# A shallow copy is enough here: only the top-level 'annotations' and 'images'
# entries are rebound below, the remaining sections are reused unchanged.
annotations_overwrite = annotations.copy()

for phase in ['train', 'test', 'val']:
    ids = splits[phase]
    # Despite its name, an_dir is the path of the phase's annotation.json file.
    _, an_dir = get_dir(phase)

    # get subset of annotations
    coco_df = df.loc[df.image_id.isin(ids)]
    coco_format = coco_df.to_dict('records')
    annotations_overwrite['annotations'] = coco_format

    # get subset of images
    images_subset = images.loc[images.id.isin(ids)]
    images_subset_coco = images_subset.to_dict('records')
    annotations_overwrite['images'] = images_subset_coco

    # overwrite category (currently disabled)
    # new_category = df_category_new.to_dict('records')
    # annotations_overwrite['categories'] = new_category

    # Do not rely on the image-copy cell having created the phase directory.
    an_dir.parent.mkdir(parents=True, exist_ok=True)
    mmcv.dump(annotations_overwrite, an_dir)
    # Number of distinct images that have annotations in this phase; read from
    # coco_df directly so an empty subset prints 0 instead of raising.
    print(coco_df.image_id.nunique())

160
20
20


In [64]:
# coco_format still holds the records of the last phase processed above ('val').
pd.DataFrame(coco_format).image_id.nunique()

20

In [67]:
# Display the full image table.
images

Unnamed: 0,id,license,file_name,coco_url,height,width,date_captured,flickr_url
0,0,0,target-001.png,,4963,3509,,
1,1,0,target-002.png,,4963,3509,,
2,2,0,target-003.png,,4963,3509,,
3,3,0,target-004.png,,4963,3509,,
4,4,0,target-005.png,,4963,3509,,
...,...,...,...,...,...,...,...,...
195,195,0,target-576.png,,3509,4963,,
196,196,0,target-577.png,,3509,4963,,
197,197,0,target-578.png,,3509,4963,,
198,198,0,target-579.png,,3509,4963,,
