# h

- clome the mmdetection repo: https://github.com/open-mmlab/mmdetection
- pip install -e . in the mmdetection directory
- Change the file: mmdetection/configs/htc/htc_r50_fpn_1x_artifact.py


python tools/train.py configs/htc/htc_r50_fpn_1x_artifact.py --cfg-options device=mps


# Data

The data came from Roboflow. You can find the dataset here: https://universe.roboflow.com/subjective/deepfake-detection-kukoh

Since we needed more data for training, we left the test set with only 10 images and moved the rest to the training and validation sets. Also, Roboflow uses a different ID format than the one used by MMDetection. Thus, we had to change the IDs to be able to use the data.

# Steps followed to prepare the data for training

### Reorganizing the data

The number of images in each set printed below are actually the numbers after the redistribution of the data. For the actual numbers, please refer to the Roboflow dataset.

In [3]:
from pycocotools.coco import COCO

# Paths to your COCO annotation files (data_artifacts is the folder with our data)
train_anno = '../data_artifacts/train/_annotations.coco.json'
valid_anno = '../data_artifacts/valid/_annotations.coco.json'
test_anno = '../data_artifacts/test/_annotations.coco.json'

# Load each dataset
coco_train = COCO(train_anno)
coco_valid = COCO(valid_anno)
coco_test = COCO(test_anno)

# Print number of images
print(f"Train images: {len(coco_train.getImgIds())}")
print(f"Validation images: {len(coco_valid.getImgIds())}")
print(f"Test images: {len(coco_test.getImgIds())}")

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Train images: 69
Validation images: 25
Test images: 10


### Fixing the annotation IDs

In [4]:
import json

def fix_annotation_ids(coco_json_path, save_path=None):
    with open(coco_json_path, 'r') as f:
        data = json.load(f)

    for idx, ann in enumerate(data['annotations']):
        ann['id'] = idx + 1  # reassign unique IDs

    if save_path is None:
        save_path = coco_json_path  # overwrite in place

    with open(save_path, 'w') as f:
        json.dump(data, f)

    print(f'Fixed annotation IDs in: {save_path}')


# Run for all sets
fix_annotation_ids('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json')
fix_annotation_ids('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json')
fix_annotation_ids('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json')

Fixed annotation IDs in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json
Fixed annotation IDs in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json
Fixed annotation IDs in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json


### Checking if all images are present

In [5]:
import os
import json

def check_image_files(annotation_path, image_folder):
    with open(annotation_path, 'r') as f:
        coco_data = json.load(f)

    image_filenames = [img['file_name'] for img in coco_data['images']]
    missing = []

    for filename in image_filenames:
        image_path = os.path.join(image_folder, filename)
        if not os.path.isfile(image_path):
            missing.append(filename)

    if missing:
        print(f"{len(missing)} missing image(s) in {image_folder}:")
        for m in missing:
            print(f"  - {m}")
    else:
        print(f"All {len(image_filenames)} images in {image_folder} are present.")


# Run on all sets
check_image_files(
    '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json',
    '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train'
)

check_image_files(
    '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json',
    '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid'
)

check_image_files(
    '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json',
    '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test'
)


All 69 images in /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train are present.
All 25 images in /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid are present.
All 10 images in /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test are present.


In [6]:
import os
from pathlib import Path
from pycocotools.coco import COCO

# Load train annotation
train_anno = '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json'
img_dir = '/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train'
coco = COCO(train_anno)

# All file names in COCO JSON
json_files = set(img['file_name'] for img in coco.dataset['images'])

# All actual image files (case insensitive)
actual_files = set(p.name for p in Path(img_dir).glob("*.*"))

# Compare
missing = json_files - actual_files
extra = actual_files - json_files

print(f"Missing {len(missing)} files from disk (referenced in JSON):")
for m in sorted(missing): print(f"  - {m}")

print(f"\n Extra {len(extra)} files on disk (not referenced in JSON):")
for e in sorted(extra): print(f"  - {e}")


loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Missing 0 files from disk (referenced in JSON):

 Extra 37 files on disk (not referenced in JSON):
  - 24032_png.rf.0319fb724c7d50e4629d79a71e67dc7d.jpg
  - 24032_png.rf.3267f9a6117bbf9afc049efa4679f6b2.jpg
  - 24040_png.rf.483f4b5ae4a734a7be385dd487140d01.jpg
  - 24040_png.rf.a0550fe9b64d1b344f1f6de0d1da03b7.jpg
  - 28005_png.rf.3f09b819455445ee7045b9c1040bb927.jpg
  - 28005_png.rf.977ca9536c50565c6cabb4520b34e89f.jpg
  - 28251_png.rf.10a4252d915242feac0228adc8246d4d.jpg
  - 28251_png.rf.54a72de28eab545eeca37991be81d5a0.jpg
  - 28251_png.rf.71cde4b376ef3e6463359709bc5f5aac.jpg
  - 28263_png.rf.02305b1df36a237689765ba8692c82b1.jpg
  - 28263_png.rf.303fad836a6b5ded0f588645d15b160c.jpg
  - 28263_png.rf.91ad62d10de334b9a4d8bda9f5404f12.jpg
  - 28375_png.rf.437e3cabb934858f6209bce220b92ea5.jpg
  - 28376_png.rf.0d3a8141a17cbed8b312866575cc7ba5.jpg
  - 28376_png.rf.3e5b6c6677b8224ec746beee0f6d7e98.jpg
  - 2837

### Splitting the data

In [7]:
import json
import random
from pathlib import Path

# Paths
base_dir = Path('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/')
train_path = base_dir / 'train/_annotations.coco.json'
valid_path = base_dir / 'valid/_annotations.coco.json'
test_path  = base_dir / 'test/_annotations.coco.json'

# Load annotation files
with open(train_path) as f:
    train_data = json.load(f)
with open(valid_path) as f:
    valid_data = json.load(f)
with open(test_path) as f:
    test_data = json.load(f)

# Shuffle and split test images
random.seed(42)
random.shuffle(test_data['images'])

test_images = test_data['images'][:10]
extra_images = test_data['images'][10:]

# Get image ids to move
extra_ids = {img['id'] for img in extra_images}
test_ids = {img['id'] for img in test_images}

# Separate corresponding annotations
extra_annotations = [ann for ann in test_data['annotations'] if ann['image_id'] in extra_ids]
test_annotations  = [ann for ann in test_data['annotations'] if ann['image_id'] in test_ids]

# Split the extra images between train and val
extra_val = extra_images[:6]
extra_train = extra_images[6:]

extra_val_ids = {img['id'] for img in extra_val}
extra_train_ids = {img['id'] for img in extra_train}

extra_val_annotations = [ann for ann in extra_annotations if ann['image_id'] in extra_val_ids]
extra_train_annotations = [ann for ann in extra_annotations if ann['image_id'] in extra_train_ids]

# Update original files
train_data['images'].extend(extra_train)
train_data['annotations'].extend(extra_train_annotations)

valid_data['images'].extend(extra_val)
valid_data['annotations'].extend(extra_val_annotations)

test_data['images'] = test_images
test_data['annotations'] = test_annotations

# Save back to disk
with open(train_path, 'w') as f:
    json.dump(train_data, f)

with open(valid_path, 'w') as f:
    json.dump(valid_data, f)

with open(test_path, 'w') as f:
    json.dump(test_data, f)

print("Split complete. Test now has 10 images. Others redistributed to train and valid.")

Split complete. Test now has 10 images. Others redistributed to train and valid.


### Checking missing files

In [8]:
import shutil

# Move extra_train images from test/ to train/
for img in extra_train:
    img_file = img['file_name']
    src = base_dir / 'test' / img_file
    dst = base_dir / 'train' / img_file
    if src.exists():
        shutil.move(str(src), str(dst))
    else:
        print(f"Missing file: {src}")

# Move extra_val images from test/ to valid/
for img in extra_val:
    img_file = img['file_name']
    src = base_dir / 'test' / img_file
    dst = base_dir / 'valid' / img_file
    if src.exists():
        shutil.move(str(src), str(dst))
    else:
        print(f"Missing file: {src}")

# Keep only test_images in test/ folder
test_img_filenames = {img['file_name'] for img in test_images}
for img_file in (base_dir / 'test').glob("*.jpg"):
    if img_file.name not in test_img_filenames:
        img_file.unlink()  # Remove unneeded file


### Normalizing the categories

In [9]:
import json

def normalize_categories(coco_path):
    with open(coco_path, 'r') as f:
        data = json.load(f)

    # Fix categories
    data['categories'] = [
        {'id': 0, 'name': 'artefact', 'supercategory': 'none'}
    ]

    # Fix category_ids in annotations
    for ann in data['annotations']:
        ann['category_id'] = 0

    with open(coco_path, 'w') as f:
        json.dump(data, f)
    print(f"Normalized categories in: {coco_path}")

normalize_categories('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json')
normalize_categories('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json')
normalize_categories('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json')


Normalized categories in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json
Normalized categories in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json
Normalized categories in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json


### Cleaning the data

In [10]:
import json
from pathlib import Path

def clean_coco_json(json_path):
    with open(json_path) as f:
        data = json.load(f)

    # Deduplicate category
    data['categories'] = [{'id': 0, 'name': 'artefact', 'supercategory': 'none'}]

    # Fix category IDs
    for ann in data['annotations']:
        ann['category_id'] = 0

    # Reassign image IDs
    id_map = {}
    for new_id, img in enumerate(data['images']):
        old_id = img['id']
        id_map[old_id] = new_id
        img['id'] = new_id

    for ann in data['annotations']:
        ann['image_id'] = id_map[ann['image_id']]

    # Remove images with no annotations (only if it's val/test!)
    ann_ids = {ann['image_id'] for ann in data['annotations']}
    data['images'] = [img for img in data['images'] if img['id'] in ann_ids]

    # Save
    with open(json_path, 'w') as f:
        json.dump(data, f, indent=2)

    print(f"Cleaned: {json_path}")

# Run on all 3
base_path = Path('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/')
clean_coco_json(base_path / 'train/_annotations.coco.json')
clean_coco_json(base_path / 'valid/_annotations.coco.json')
clean_coco_json(base_path / 'test/_annotations.coco.json')


Cleaned: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json
Cleaned: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json
Cleaned: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json


In [11]:
import json
from pathlib import Path

def fix_annotation_ids(json_path):
    with open(json_path) as f:
        data = json.load(f)

    # Reassign unique annotation IDs
    for new_id, ann in enumerate(data['annotations']):
        ann['id'] = new_id + 1

    with open(json_path, 'w') as f:
        json.dump(data, f, indent=2)

    print(f"Fixed annotation IDs in: {json_path}")

# Just for train (the one throwing the error)
fix_annotation_ids('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json')
fix_annotation_ids('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json')
fix_annotation_ids('/Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json')


Fixed annotation IDs in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/train/_annotations.coco.json
Fixed annotation IDs in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/valid/_annotations.coco.json
Fixed annotation IDs in: /Users/jbm/Documents/DSAN_6500/WatchdogAI/data_artifacts/test/_annotations.coco.json
