In [None]:
!pip install bounding-box
!pip install natsort

In [None]:
import pandas as pd 
import glob
from pprint import pprint
import os
from tqdm import tqdm
from bounding_box import bounding_box as bb
import os
from IPython.display import Image, display
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import shutil
from natsort import natsorted

In [None]:
# Inputs: original annotation CSVs, the split table and the frame folders.
annotations_file_path = "/kaggle/input/cs406-dataset/annotation/annotation"
data_split_path = '/kaggle/input/cs406-dataset/data_split.csv'
# Frames are spread over part_1 ... part_7, each nested one level deep.
part_folders = [f"/kaggle/input/cs406-dataset/image/part_{i}/part_{i}" for i in range(1, 8)]

# Outputs: relabelled annotation CSVs and the exported images / labels.
annotations_file_part = "/kaggle/working/annotations"
image_output_dir = "/kaggle/working/dataset/images"
anno_output_dir = "/kaggle/working/dataset/labels"

# Create every output folder from the variables above so the paths cannot drift apart.
for output_dir in (annotations_file_part, image_output_dir, anno_output_dir):
    os.makedirs(output_dir, exist_ok=True)

# Load file

In [None]:
# Collect the annotation CSV paths and read the split table.
annotation_pattern = f"{annotations_file_path}/*.csv"
csv_files = glob.glob(annotation_pattern)
data_split = pd.read_csv(data_split_path)

# Map labels to two classes (Helmet / NoHelmet)

In [None]:
# Collapse the original labels into two classes and save the relabelled CSVs
# to annotations_file_part: any label containing "NoHelmet" becomes "NoHelmet",
# every other label becomes "Helmet".
os.makedirs(annotations_file_part, exist_ok=True)

for csv_file in tqdm(sorted(csv_files)):
    data = pd.read_csv(csv_file)
    if "label" not in data.columns:
        # No relabelled copy is written for this file, so report it instead of skipping silently.
        tqdm.write(f"Skipping {csv_file}: no 'label' column")
        continue
    if data["label"].isna().any():
        # A missing label cannot be assigned to either class; fail with a clear message.
        raise ValueError(f"{csv_file} contains rows with a missing label")

    # Plain substring match (regex=False), same test as `"NoHelmet" in label`.
    has_no_helmet = data["label"].str.contains("NoHelmet", regex=False)
    data["label"] = has_no_helmet.map({True: "NoHelmet", False: "Helmet"})

    output_file = os.path.join(annotations_file_part, os.path.basename(csv_file))
    data.to_csv(output_file, index=False)

In [None]:
# Sample video used for the checks below: its original annotations, a copy
# ordered by frame, and the paths of its frame files.
bago_highway_1 = pd.read_csv("/kaggle/input/cs406-dataset/annotation/annotation/Bago_highway_1.csv")
sorted_df = bago_highway_1.sort_values('frame_id')
# Plain string sort of the paths (not a numeric sort of the frame numbers).
images = sorted(glob.glob("/kaggle/input/cs406-dataset/image/part_1/part_1/Bago_highway_1/*"))

In [None]:
# Column names of the Bago_highway_1 annotation table.
bago_highway_1.columns

In [None]:
# First rows of the table read from data_split.csv.
data_split.head()

# Draw bounding boxes

In [None]:
# Preview: draw every annotated box of the first frame (first path in `images`).
# Slicing with [:1] keeps this a no-op when no frame files were found.
for image_file in images[:1]:
    # The file name without its extension is the frame number.
    frame_number = int(os.path.splitext(os.path.basename(image_file))[0])
    frame_rows = bago_highway_1[bago_highway_1['frame_id'] == frame_number]

    # Draw on a copy so the array returned by imread is left untouched.
    canvas = mpimg.imread(image_file).copy()
    for _, box in frame_rows.iterrows():
        # x / y are the left / top edge; adding w / h gives the right / bottom edge.
        bb.add(canvas, box['x'], box['y'], box['x'] + box['w'], box['y'] + box['h'], box['label'], "green")

    plt.imshow(canvas)
    plt.axis('off')
    plt.show()

# Split data

In [None]:
def find_video(video_id, part_folders):
    """Locate a video's folder among the dataset part folders.

    Args:
        video_id: Name of the video's sub-folder.
        part_folders: Folders to search, in the order they should be tried.

    Returns:
        ``<part_folder>/<video_id>`` for the first part folder in which that
        path exists, or ``None`` when no part folder contains it.
    """
    candidates = (os.path.join(folder, video_id) for folder in part_folders)
    return next((path for path in candidates if os.path.exists(path)), None)

In [None]:
# Sanity check: folder holding the Bago_highway_1 frames (None if no part folder contains it).
find_video("Bago_highway_1", part_folders)

In [None]:
# Build a "Bago_highway_1_<frame>" identifier for every file in the sample video's folder.
sample_video_dir = find_video("Bago_highway_1", part_folders)
if sample_video_dir is None:
    # os.listdir(None) would list the current working directory instead of failing.
    raise FileNotFoundError("Bago_highway_1 was not found in any part folder")

file = [
    f"Bago_highway_1_{os.path.splitext(img)[0]}"
    for img in os.listdir(sample_video_dir)
]

In [None]:
# Number of frame identifiers collected for the sample video.
len(file)

In [None]:
# Read the copy of the Bago_highway_1 annotations stored under annotations_file_part.
annotation1 = os.path.join(annotations_file_part, "Bago_highway_1.csv")
annot1 = pd.read_csv(annotation1)

In [None]:
# Annotation rows belonging to frame 1.
annot1.loc[annot1['frame_id'].eq(1)]

In [None]:
# Frame size that save_yolo_annotation divides box coordinates by.
# NOTE(review): assumes every frame is 1920x1080 — confirm against the images.
image_width = 1920
image_height = 1080

# YOLO format: `label x_center y_center width height` (box values normalised by the image size)

In [None]:
def save_yolo_annotation(annotation, output_annotation_path, frame_width=None, frame_height=None):
    """Write the boxes of one frame to a YOLO label file.

    Each output line is ``<class> <x_center> <y_center> <width> <height>``,
    where the four box values are divided by the frame size. Class 1 is
    written for the label "Helmet" and class 0 for every other label.

    Boxes are clipped to the frame before normalising so that every written
    value lies in [0, 1]; a box with no area left inside the frame is skipped.
    Boxes that already lie inside the frame are written unchanged.

    Args:
        annotation: DataFrame with the columns ``x``, ``y`` (left / top edge
            of the box), ``w``, ``h`` (box size) and ``label``.
        output_annotation_path: Text file to create (overwritten if present).
            An empty ``annotation`` produces an empty file.
        frame_width: Frame width in the units of ``x`` / ``w``. Defaults to
            the notebook-level ``image_width``.
        frame_height: Frame height in the units of ``y`` / ``h``. Defaults to
            the notebook-level ``image_height``.
    """
    # Fall back to the notebook-level frame size so two-argument calls keep working.
    if frame_width is None:
        frame_width = image_width
    if frame_height is None:
        frame_height = image_height

    columns = [annotation[name] for name in ('x', 'y', 'w', 'h', 'label')]

    with open(output_annotation_path, 'w') as f:
        for x, y, w, h, label in zip(*columns):
            # Clip the box to the frame.
            left, right = max(x, 0), min(x + w, frame_width)
            top, bottom = max(y, 0), min(y + h, frame_height)
            if right <= left or bottom <= top:
                continue

            x_center = (left + right) / 2 / frame_width
            y_center = (top + bottom) / 2 / frame_height
            width = (right - left) / frame_width
            height = (bottom - top) / frame_height

            label_num = 1 if label == "Helmet" else 0

            f.write(f"{label_num} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")


In [None]:
# Names of all entries found in the existing part folders, in natural sort order.
video_files = natsorted(
    entry
    for folder in part_folders
    if os.path.isdir(folder)
    for entry in os.listdir(folder)
)

In [None]:
# Export every video listed in the split table into this layout:
#   <image_output_dir>/<split>/<video_id>/<frame>.jpg
#   <anno_output_dir>/<split>/<video_id>/<frame>.txt
# The split names "training" / "validation" are shortened to "train" / "val";
# any other value is used unchanged as the folder name.
split_folder_names = {"training": "train", "validation": "val"}
skipped_videos = []  # (video_id, reason) for every video that could not be exported

for _, row in tqdm(data_split.iterrows(), total=len(data_split), desc="Processing videos"):
    video_id = row['video_id']
    dataset_type = split_folder_names.get(row['Set'], row['Set'])

    video_path = find_video(video_id, part_folders)
    if not video_path:
        skipped_videos.append((video_id, "frame folder not found"))
        continue

    # Check for the annotations before copying anything, so a video is never
    # exported without its labels and a missing CSV does not abort the run.
    annotation_file = os.path.join(annotations_file_part, f'{video_id}.csv')
    if not os.path.isfile(annotation_file):
        skipped_videos.append((video_id, "annotation CSV not found"))
        continue
    annotations = pd.read_csv(annotation_file).sort_values(by='frame_id')

    video_image_dir = os.path.join(image_output_dir, dataset_type, f"{video_id}")
    video_annotation_dir = os.path.join(anno_output_dir, dataset_type, f"{video_id}")
    os.makedirs(video_image_dir, exist_ok=True)
    os.makedirs(video_annotation_dir, exist_ok=True)

    image_files = natsorted([img for img in os.listdir(video_path) if img.endswith('.jpg')])

    for img in tqdm(image_files, desc=f"Processing {video_id}", leave=False):
        # The file name without its extension is the frame number.
        frame_id = os.path.splitext(img)[0]

        src_image_path = os.path.join(video_path, img)
        dst_image_path = os.path.join(video_image_dir, f"{frame_id}.jpg")
        shutil.copy(src_image_path, dst_image_path)

        # A frame without annotation rows still gets an (empty) label file.
        frame_annotations = annotations[annotations['frame_id'] == int(frame_id)]
        output_annotation_path = os.path.join(video_annotation_dir, f"{frame_id}.txt")
        save_yolo_annotation(frame_annotations, output_annotation_path)

# Report the skipped videos once, after the progress bars are done.
for skipped_id, reason in skipped_videos:
    print(f"Skipped {skipped_id}: {reason}")

# Clean output

In [None]:
# Disabled helper, kept commented out: when enabled it deletes every file and
# sub-folder inside the given directory. Uncomment only to reset the outputs.
# def clean_all_output(output_path):
#     if os.path.exists(output_path):
#         for item in os.listdir(output_path):
#             item_path = os.path.join(output_path, item)
#             if os.path.isdir(item_path):
#                 shutil.rmtree(item_path)
#             else:
#                 os.remove(item_path)
#         print(f"All contents of '{output_path}' have been deleted.")
#     else:
#         print(f"Directory '{output_path}' does not exist.")

# output_path = "/kaggle/working/" 
# clean_all_output(output_path)