In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file found under the Kaggle input directory.
for folder, _, entries in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in entries):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Step 1: Setup and Preparation

!pip install torch torchvision matplotlib seaborn opencv-python
!git clone https://github.com/ultralytics/yolov5

# Change to YOLOv5 directory
%cd yolov5

# Install YOLOv5 dependencies
!pip install -r requirements.txt

In [None]:
# Data Preparation
import os
import random
import shutil
from pathlib import Path
import yaml

# Define paths
DATA_DIR = "/kaggle/input/dlp-object-detection-week-10-may-2025/final_dlp_data/final_dlp_data/train"
TEST_DIR = "/kaggle/input/dlp-object-detection-week-10-may-2025/final_dlp_data/final_dlp_data/test"
TRAIN_SPLIT = "dataset/train_split"
VAL_SPLIT = "dataset/val_split"


def split_dataset(data_dir, train_dir, val_dir, train_fraction=0.8, seed=42):
    """Copy a shuffled train/validation split of ``data_dir`` into two folders.

    Images are read from ``<data_dir>/images/*.jpeg`` and each one is paired
    with ``<data_dir>/labels/<same stem>.txt``. Pairing by file stem (instead of
    by position in two independently sorted lists) keeps every image with its
    own label even when some files have no counterpart.

    Args:
        data_dir: Folder containing the ``images`` and ``labels`` sub-folders.
        train_dir: Destination folder for the training split.
        val_dir: Destination folder for the validation split.
        train_fraction: Share of the labelled pairs that goes to training.
        seed: Seed for the shuffle, so the split is reproducible.

    Returns:
        ``(train_pairs, val_pairs, unlabeled_images)`` where the first two are
        lists of ``(image_path, label_path)`` tuples and the last is the list
        of images that were skipped because no label file exists for them.
    """
    images_dir = Path(data_dir) / "images"
    labels_dir = Path(data_dir) / "labels"
    train_dir, val_dir = Path(train_dir), Path(val_dir)

    # Create directories
    for split_dir in (train_dir, val_dir):
        os.makedirs(split_dir / "images", exist_ok=True)
        os.makedirs(split_dir / "labels", exist_ok=True)

    # Match every image with the label file that shares its stem
    pairs, unlabeled = [], []
    for img in sorted(images_dir.glob("*.jpeg")):
        lbl = labels_dir / f"{img.stem}.txt"
        if lbl.is_file():
            pairs.append((img, lbl))
        else:
            unlabeled.append(img)

    # Shuffle before cutting so the validation set is a random sample rather
    # than simply the alphabetically last files. A private Random instance
    # keeps the result independent of the global random state.
    random.Random(seed).shuffle(pairs)

    # Split into train and validation
    split_idx = int(train_fraction * len(pairs))
    train_pairs, val_pairs = pairs[:split_idx], pairs[split_idx:]

    # Copy files to their respective directories
    for split_dir, split_pairs in ((train_dir, train_pairs), (val_dir, val_pairs)):
        for img, lbl in split_pairs:
            shutil.copy(img, split_dir / "images" / img.name)
            shutil.copy(lbl, split_dir / "labels" / lbl.name)

    return train_pairs, val_pairs, unlabeled


train_pairs, val_pairs, unlabeled_images = split_dataset(DATA_DIR, TRAIN_SPLIT, VAL_SPLIT)
train_images = [img for img, _ in train_pairs]
train_labels = [lbl for _, lbl in train_pairs]
val_images = [img for img, _ in val_pairs]
val_labels = [lbl for _, lbl in val_pairs]

if unlabeled_images:
    print(f"Skipped {len(unlabeled_images)} images without a label file")
print(f"Training images: {len(train_images)}, Validation images: {len(val_images)}")

In [None]:
# Dataset configuration: where the two image splits live, how many classes
# there are, and the class names in index order.
DATA_YAML = """
train: dataset/train_split/images
val: dataset/val_split/images

nc: 6
names: ["aegypti", "albopictus", "anopheles", "culex", "culiseta", "japonicus/koreicus"]
"""

# Persist the configuration text unchanged to dataset.yaml.
with open("dataset.yaml", mode="w") as yaml_file:
    print(DATA_YAML, end="", file=yaml_file)


In [None]:
# Modify Hyperparameters
# Only the values that should differ from the YOLOv5 defaults are listed here.
HYP = """
lr0: 0.001
lrf: 0.2
momentum: 0.937
weight_decay: 0.0005
warmup_epochs: 3.0
hsv_h: 0.015
hsv_s: 0.7
hsv_v: 0.4
degrees: 0.5
translate: 0.1
scale: 0.5
shear: 0.1
mosaic: 1.0
mixup: 0.2
"""

# NOTE(review): YOLOv5's train.py is expected to index further keys (loss gains
# such as box/cls/obj, warmup and flip settings, ...) directly, so a file that
# holds only the overrides above would fail with a KeyError. Confirm against
# the cloned YOLOv5 version. To stay safe, start from the default
# hyperparameter file shipped with the repo and apply the overrides on top.
DEFAULT_HYP_PATH = "data/hyps/hyp.scratch-low.yaml"

hyp_values = {}
if os.path.isfile(DEFAULT_HYP_PATH):
    with open(DEFAULT_HYP_PATH) as f:
        hyp_values.update(yaml.safe_load(f) or {})
else:
    # Best effort: keep the previous behaviour if the defaults cannot be found.
    print(f"Warning: {DEFAULT_HYP_PATH} not found; writing the overrides only")
hyp_values.update(yaml.safe_load(HYP))

with open("hyper_parameter.yaml", "w") as f:
    yaml.safe_dump(hyp_values, f, sort_keys=False)


In [None]:
%env WANDB_MODE = disabled

In [None]:
#Train the Model

import torch
import warnings
warnings.filterwarnings('ignore')


# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

#Use dual GPU follow instruction structure from  ( https://docs.ultralytics.com/yolov5/tutorials/multi_gpu_training/#single-gpu )
# for Distributed Data Parallel



# Train the model
!python -m torch.distributed.run --nproc_per_node 2 train.py \
    --data dataset.yaml \
    --weights yolov5m.pt \
    --epochs 50 \
    --batch-size 64 \
    --img-size 640 \
    --project mosquito-detection \
    --name experiment \
    --device 0,1 \
    --hyp hyper_parameter.yaml

# To resume an interrupted run, pass that run's last.pt to --weights and add --resume after train.py


In [None]:
# Step 6: Run Inference


!python detect.py \
    --weights /kaggle/working/yolov5/mosquito-detection/experiment9/weights/best.pt \ # set path accordingly. Path will likely be different for you depending on how many times you have run training before prediciting
    --source  /kaggle/input/dlp-object-detection-week-10-may-2025/final_dlp_data/final_dlp_data/test/images \ # This path may also be different
    --img-size 640 \
    --conf-thres 0.3 \
    --iou-thres 0.5 \
    --save-txt \
    --save-conf \
    --project mosquito-detection-results \
    --name inference \
    --agnostic-nms


In [None]:
# Generate Submission File
import pandas as pd
import glob
from collections import defaultdict

# Path to YOLOv5 predictions
# Every detect.py run writes to a new numbered folder (inference, inference2,
# ...), so use the most recently written one instead of a fixed run number.
PREDICTIONS_ROOT = "/kaggle/working/yolov5/mosquito-detection-results"
run_label_dirs = sorted(
    glob.glob(os.path.join(PREDICTIONS_ROOT, "inference*", "labels")),
    key=os.path.getmtime,
)
# Fall back to the un-numbered run name when nothing has been written yet.
predictions_dir = run_label_dirs[-1] if run_label_dirs else os.path.join(PREDICTIONS_ROOT, "inference", "labels")

# Class names in index order (same order as in dataset.yaml).
CLASS_NAMES = ["aegypti", "albopictus", "anopheles", "culex", "culiseta", "japonicus/koreicus"]


def load_predictions(labels_dir, class_names=CLASS_NAMES):
    """Read the prediction text files in ``labels_dir`` into a list of row dicts.

    Every non-blank line must hold six numbers:
    ``class x_center y_center width height confidence``.

    Args:
        labels_dir: Folder containing one ``<image stem>.txt`` file per image.
        class_names: Class names indexed by the integer class id.

    Returns:
        A list of dicts with the keys ``id``, ``ImageID``, ``LabelName``,
        ``Conf``, ``xcenter``, ``ycenter``, ``bbx_width`` and ``bbx_height``.
        Files are processed in sorted order so the ``id`` values are
        reproducible.

    Raises:
        ValueError: If a non-blank line does not contain exactly six numbers.
    """
    rows = []
    for pred_file in sorted(glob.glob(os.path.join(labels_dir, "*.txt"))):
        # Swap only the final extension, not every ".txt" inside the name.
        image_id = os.path.splitext(os.path.basename(pred_file))[0] + ".jpeg"
        with open(pred_file, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                label, x_center, y_center, width, height, conf = map(float, fields)
                rows.append({
                    "id": len(rows),
                    "ImageID": image_id,
                    "LabelName": class_names[int(label)],
                    "Conf": conf,
                    "xcenter": x_center,
                    "ycenter": y_center,
                    "bbx_width": width,
                    "bbx_height": height
                })
    return rows


submission = load_predictions(predictions_dir)
print(f"Loaded {len(submission)} predictions from {predictions_dir}")

# Preview the raw predictions
submission_df = pd.DataFrame(submission)
submission_df.head()

In [None]:
from collections import Counter

# Tally how many prediction rows each ImageID has
image_counts = Counter(submission_df['ImageID'])

# Number of images that received more than one prediction
total_duplicate_count = sum(1 for n_preds in image_counts.values() if n_preds > 1)

print(total_duplicate_count)

In [None]:
# Within each image, keep only the most confident prediction of every class
predictions_by_image = defaultdict(list)
for row in submission:
    predictions_by_image[row["ImageID"]].append(row)

filtered_submission = []
for preds in predictions_by_image.values():
    # Classes stay in first-seen order; on equal confidence the earlier row wins.
    best_per_class = {}
    for pred in preds:
        incumbent = best_per_class.get(pred["LabelName"])
        if incumbent is None or pred["Conf"] > incumbent["Conf"]:
            best_per_class[pred["LabelName"]] = pred
    filtered_submission.extend(best_per_class.values())

# Rebuild the DataFrame from the filtered rows
submission_df = pd.DataFrame(filtered_submission)

print(f"Number of predictions after filtering: {len(filtered_submission)}")

submission_df.head()

In [None]:
from collections import Counter

# Re-count, after filtering, how many images still have more than one row
image_counts = Counter(submission_df['ImageID'])
total_duplicate_count = len([img for img, n_rows in image_counts.items() if n_rows > 1])

print(total_duplicate_count)

In [None]:
# Compare the number of distinct images with the number of prediction rows
total_rows = submission_df.shape[0]
unique_image_ids = submission_df["ImageID"].nunique()
print(f"Unique ImageIDs: {unique_image_ids}")
print(f"Total rows: {total_rows}")

In [None]:
import os
from pprint import pprint

# Collect the IDs of all test images: file names with ".jpeg" removed
# (set the path accordingly)
test_image_ids = {
    file_name.replace(".jpeg", "")
    for file_name in os.listdir("/kaggle/input/dlp-object-detection-week-10-may-2025/final_dlp_data/final_dlp_data/test/images")
}


In [None]:

# IDs (file names without ".jpeg") of the images that have at least one row.
# regex=False forces a literal match: with a regex-based replace the "." in
# ".jpeg" would match any character, and the default differs between pandas
# versions.
predicted_image_ids = set(submission_df["ImageID"].str.replace(".jpeg", "", regex=False))

# Test images for which no prediction row exists
missing_image_ids = test_image_ids - predicted_image_ids
print(f"Missing ImageIDs: {len(missing_image_ids)}")

In [None]:
# Stems of all prediction files found in predictions_dir
prediction_files = set(
    os.path.basename(f).replace(".txt", "") for f in glob.glob(os.path.join(predictions_dir, "*.txt"))
)

# Missing images for which no prediction file exists at all
missing_in_predictions = missing_image_ids - prediction_files

# pprint is given the collection itself; pretty-printing an f-string would
# only print the quoted repr of one long string.
print(f"Images without predictions: {len(missing_in_predictions)}")
pprint(sorted(missing_in_predictions))

In [None]:
from collections import Counter

# Map every ImageID that has more than one row to its row count
duplicate_counts = Counter(submission_df["ImageID"])
duplicates = {}
for img_id, count in duplicate_counts.items():
    if count > 1:
        duplicates[img_id] = count
print(f"Overpredicted ImageIDs: {len(duplicates)}")

In [None]:
# Print all rows of every image that has more than one prediction
for img_id in duplicates:
    rows_for_image = submission_df.loc[submission_df["ImageID"] == img_id]
    print(rows_for_image)

In [None]:
import random

# List of available class labels
available_classes = ["aegypti", "albopictus", "anopheles", "culex", "culiseta", "japonicus/koreicus"]

# A seeded, private generator plus a sorted iteration order make the
# placeholder labels identical on every run (set iteration order is not stable).
placeholder_rng = random.Random(42)

# Work on a copy so that re-running this cell does not keep appending the same
# placeholder rows to `submission`.
submission_with_defaults = list(submission)

# Add default predictions for missing ImageIDs
for img_id in sorted(missing_image_ids):
    random_label = placeholder_rng.choice(available_classes)
    submission_with_defaults.append({
        "id": len(submission_with_defaults),
        "ImageID": f"{img_id}.jpeg",
        "LabelName": random_label,  # Assign a random label from available 6 labels
        "Conf": 0.0,  # Confidence of 0
        "xcenter": 0.5,
        "ycenter": 0.5,
        "bbx_width": 0.1,
        "bbx_height": 0.1
    })

# Create DataFrame
submission_df = pd.DataFrame(submission_with_defaults)

print(f"Final Unique ImageIDs: {submission_df['ImageID'].nunique()}")
print(f"Final Total Rows: {len(submission_df)}")

In [None]:
# For every image keep the single row with the highest confidence
best_row_labels = submission_df.groupby("ImageID")["Conf"].idxmax()
unique_submission = submission_df.loc[best_row_labels].reset_index(drop=True)

print(f"Final Unique ImageIDs: {unique_submission['ImageID'].nunique()}")
print(f"Final Total Rows: {len(unique_submission)}")

In [None]:
# Preview the first five rows of the one-row-per-image table
unique_submission.head(n=5)

In [None]:
import pandas as pd

# Load the sample submission provided with the competition data and show its first rows
sample_submission_path = "/kaggle/input/dlp-object-detection-week-10-may-2025/sample_submission.csv"
sample_submission = pd.read_csv(sample_submission_path)
sample_preview = sample_submission.head()
print(sample_preview)


In [None]:
# Attach the ids from the sample submission to our predictions, matching on ImageID.
# Our own 'id' column is dropped first so the merge does not produce two id columns.
prediction_columns = unique_submission.drop(columns=["id"], errors="ignore")
id_lookup = sample_submission[["id", "ImageID"]]
final_submission = prediction_columns.merge(id_lookup, on="ImageID", how="left")

# Put 'id' first and keep the remaining columns in their current order
columns_order = ["id", *final_submission.columns.drop("id")]
final_submission = final_submission[columns_order]
final_submission.head()

In [None]:
# Number of rows in the final submission table
final_submission.shape[0]

In [None]:
# Write the final table to submission.csv without the DataFrame index
unique_submission = final_submission.copy()
unique_submission.to_csv(path_or_buf="submission.csv", index=False)