In [None]:

!pip install ultralytics

Load YOLO model


In [None]:
from ultralytics import YOLO
import torch
import torch.nn.functional as F

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Detector restored from the checkpoint file at the path below.
yolo = YOLO("/content/drive/MyDrive/Colab Notebooks/MINIPROJECT/best.pt")


Load Swin model

In [None]:
import timm
import torch
import re
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# Number of output classes. Defined in this cell too so that it runs on a
# fresh kernel (Restart & Run All) without depending on a later cell.
NUM_CLASSES = 5

ckpt = torch.load("/content/drive/MyDrive/Colab Notebooks/MINIPROJECT/best_fusionnet.pth", map_location="cpu")
# Keys saved as "swin.<name>" are renamed to "<name>"; all other keys are kept as-is.
ckpt = {re.sub(r"^swin\.", "", k): v for k, v in ckpt.items()}


# num_classes=0 builds the backbone without a classification layer.
swin = timm.create_model("swin_base_patch4_window7_224", pretrained=False, num_classes=0)

# strict=False tolerates key mismatches, so report them instead of silently
# continuing with a partially (or not at all) restored backbone.
swin_load_result = swin.load_state_dict(ckpt, strict=False)
if swin_load_result.missing_keys:
    print(f"[Swin] {len(swin_load_result.missing_keys)} model weights were NOT found in the checkpoint, "
          f"e.g. {swin_load_result.missing_keys[:5]}")
if swin_load_result.unexpected_keys:
    print(f"[Swin] {len(swin_load_result.unexpected_keys)} checkpoint entries were not used, "
          f"e.g. {swin_load_result.unexpected_keys[:5]}")

swin = swin.to(device)
swin.eval()

# Classification layer applied on top of the pooled 1024-d backbone features.
# NOTE(review): this layer is created with fresh random weights and nothing in
# this cell loads trained weights into it. If the checkpoint contains a trained
# classifier (check the "not used" entries reported above), load it here;
# otherwise the Swin branch of the ensemble contributes untrained predictions.
swin_fc = nn.Linear(1024, NUM_CLASSES).to(device)
swin_fc.eval()

def swin_forward(img):
    """Run the Swin backbone plus `swin_fc` on a batch and return raw logits.

    Gradients are disabled for the whole computation.

    Args:
        img: input batch passed straight to `swin.forward_features`.

    Returns:
        Output of `swin_fc` for the globally average-pooled features
        (one row per batch element).
    """
    with torch.no_grad():
        # The permute below treats the backbone output as channels-last
        # (B, H, W, C) and moves channels to dim 1 for 2-D pooling.
        channels_first = swin.forward_features(img).permute(0, 3, 1, 2)
        # Collapse the spatial grid to 1x1, then drop the singleton dims.
        pooled = F.adaptive_avg_pool2d(channels_first, (1, 1))
        flat = pooled.view(pooled.size(0), -1)
        logits = swin_fc(flat)
    return logits


Load MaxViT model

In [None]:
import timm
import torch
import torch.nn as nn
import re

device = "cuda" if torch.cuda.is_available() else "cpu"
NUM_CLASSES = 5

ckpt = torch.load("/content/drive/MyDrive/Colab Notebooks/MINIPROJECT/best_maxvit_rdd2022.pth", map_location="cpu")

# Keys saved as "maxvit.<name>" are renamed to "<name>"; all other keys are kept as-is.
ckpt = {re.sub(r"^maxvit\.", "", k): v for k, v in ckpt.items()}

# Build the network directly with NUM_CLASSES outputs so that classifier-head
# weights stored in the checkpoint can be loaded. Dropping every "head*" key and
# attaching a new nn.Linear afterwards would leave the head randomly initialised.
maxvit = timm.create_model("maxvit_tiny_tf_224.in1k", pretrained=False, num_classes=NUM_CLASSES)
in_features = maxvit.head.fc.in_features

# Keep only checkpoint tensors whose name and shape match the model, so an
# incompatible entry is reported below rather than raising a size-mismatch error.
model_state = maxvit.state_dict()
filtered_weights = {
    k: v for k, v in ckpt.items()
    if k in model_state and getattr(v, "shape", None) == model_state[k].shape
}
skipped_keys = sorted(set(ckpt) - set(filtered_weights))
missing, unexpected = maxvit.load_state_dict(filtered_weights, strict=False)

maxvit = maxvit.to(device)
maxvit.eval()

print(f"MaxViT built with {NUM_CLASSES} output classes (head input features: {in_features}).")
if skipped_keys:
    print(f"[MaxViT] {len(skipped_keys)} checkpoint entries were skipped (unknown name or shape mismatch), "
          f"e.g. {skipped_keys[:5]}")
if missing:
    print(f"[MaxViT] {len(missing)} model weights were NOT found in the checkpoint, e.g. {list(missing)[:5]}")
    if any(k.startswith("head") for k in missing):
        print("[MaxViT] WARNING: classifier head weights are missing; the head is randomly "
              "initialised and its predictions are not meaningful.")
else:
    print("MaxViT-Tiny restored successfully!")


In [None]:
from torchvision import transforms

# Display name for each class index (position in the list == class index).
CLASS_NAMES = [
    "longitudinal crack",
    "transverse crack",
    "alligator crack",
    "other corruption",
    "Pothole",
]

# Shared preprocessing: resize to 224x224, convert to a tensor, then apply
# per-channel normalisation with the mean/std values listed below.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_preprocess_steps)


In [None]:
def classify_ensemble(pil_crop, yolo_cls, yolo_conf, cls_weight=0.7, yolo_weight=0.3):
    """Classify one detection crop by fusing Swin, MaxViT and the YOLO vote.

    The two classifier softmax outputs are averaged, then combined with a
    vector that holds `yolo_conf` at index `yolo_cls` and zeros elsewhere:
    `cls_weight * classifier_avg + yolo_weight * yolo_vote`.

    Args:
        pil_crop: PIL image of the detected region.
        yolo_cls: class index predicted by the detector. It is used directly as
            a column index into the classifier output, so it must follow the
            same class ordering as the classifiers.
        yolo_conf: detector confidence for `yolo_cls`.
        cls_weight: weight of the averaged classifier probabilities (default 0.7).
        yolo_weight: weight of the detector vote (default 0.3).

    Returns:
        (final_cls, final_conf): index of the highest fused score and that
        score. The fused scores are not renormalised, so `final_conf` is a
        score rather than a calibrated probability.
    """
    # Use the notebook's shared preprocessing pipeline (`transform`) instead of
    # rebuilding a private one on every call. The shared pipeline includes the
    # Normalize step that the previous local pipeline lacked.
    # NOTE(review): assumes the classifiers were trained with that normalisation
    # — confirm against the training code.
    # convert("RGB") guards against grayscale / RGBA / CMYK inputs, which would
    # otherwise produce a tensor with the wrong number of channels.
    img_t = transform(pil_crop.convert("RGB")).unsqueeze(0).to(device)

    with torch.no_grad():
        p_swin = F.softmax(swin_forward(img_t), dim=1)
        p_max = F.softmax(maxvit(img_t), dim=1)

        p_cls = (p_swin + p_max) / 2   # classifier average

        # Detector vote: its confidence at the detected class, zero elsewhere.
        yolo_vec = torch.zeros_like(p_cls)
        yolo_vec[0, yolo_cls] = yolo_conf

        # Weighted fusion (classifiers weighted more heavily by default).
        p = (p_cls * cls_weight) + (yolo_vec * yolo_weight)

        final_cls = torch.argmax(p, dim=1).item()
        final_conf = p[0, final_cls].item()

    return final_cls, final_conf


In [None]:
from ultralytics import YOLO
from PIL import Image

# NOTE(review): the same checkpoint is also loaded into `yolo` in the first
# model cell; this line re-creates the detector.
yolo = YOLO("/content/drive/MyDrive/Colab Notebooks/MINIPROJECT/best.pt")

def run_ensemble_on_image(img_path):
    """Detect regions with YOLO and re-classify each one with the ensemble.

    Args:
        img_path: path of the image file to process.

    Returns:
        List of `(class_name, confidence, (x1, y1, x2, y2))` tuples, one per
        detection with a non-empty crop. Box coordinates are integer pixels.
    """
    result = yolo(img_path, conf=0.15, iou=0.45)[0]
    boxes = result.boxes

    predictions = []

    # Decode the image once (not once per box) and release the file handle as
    # soon as the pixel data has been copied into the RGB image.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert("RGB")

    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        yolo_conf = float(box.conf.item())
        yolo_cls = int(box.cls.item())

        # Integer truncation can collapse a very thin box to zero width or
        # height; such a crop cannot be resized, so skip it.
        if x2 <= x1 or y2 <= y1:
            continue

        crop = img.crop((x1, y1, x2, y2))

        cls, conf = classify_ensemble(crop, yolo_cls, yolo_conf)
        predictions.append((CLASS_NAMES[cls], conf, (x1, y1, x2, y2)))

    return predictions


In [None]:
import os
import glob
import torch
import numpy as np
from PIL import Image
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Minimum IoU for a detection to be paired with a ground-truth box.
IOU_THRESHOLD = 0.4

def compute_iou(box1, box2):
    """Return the intersection-over-union of two boxes.

    Each box is indexable as `(x1, y1, x2, y2)`. The result is
    `intersection / union`, or 0 when the union is not positive.
    """
    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    intersection = overlap_w * overlap_h

    first_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    second_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    combined = first_area + second_area - intersection

    if not combined > 0:
        return 0
    return intersection / combined


def evaluate_ensemble(test_images_dir, test_labels_dir):
    """Evaluate ensemble classification on detections matched to ground truth.

    For every `*.jpg` in `test_images_dir` that has a same-named `.txt` label
    file in `test_labels_dir`, YOLO is run, each detection is paired with the
    ground-truth box of highest IoU, and detections whose best IoU reaches
    `IOU_THRESHOLD` are classified with `classify_ensemble`. Accuracy and
    weighted precision / recall / F1 over those matched detections are printed.

    Label files are parsed as one object per line:
    `class x_center y_center width height`, with coordinates expressed as
    fractions of the image size.

    Caveats: detections without a matching ground-truth box and ground-truth
    boxes without any detection are both excluded from the metrics, so the
    numbers measure classification quality on matched detections only, not
    end-to-end detection quality.

    Args:
        test_images_dir: directory containing the test `.jpg` images.
        test_labels_dir: directory containing the label `.txt` files.

    Returns:
        None. Results are printed.
    """
    y_true = []
    y_pred = []

    image_paths = sorted(glob.glob(os.path.join(test_images_dir, "*.jpg")))

    for img_path in image_paths:
        # splitext swaps only the extension; str.replace would also rewrite a
        # ".jpg" that appears earlier in the file name.
        stem = os.path.splitext(os.path.basename(img_path))[0]
        label_path = os.path.join(test_labels_dir, stem + ".txt")
        if not os.path.exists(label_path):
            continue

        result = yolo(img_path, conf=0.15, iou=0.45)[0]
        boxes = result.boxes

        # Decode once, force 3-channel RGB (the classifiers receive crops of
        # this image) and release the file handle immediately.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        W, H = img.size

        # Load ground truth, converting normalised centre/size to pixel corners.
        gt_boxes = []
        with open(label_path, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines (e.g. trailing newline)
                cls, xc, yc, w, h = map(float, fields)
                cls = int(cls)
                x1 = (xc - w/2) * W
                y1 = (yc - h/2) * H
                x2 = (xc + w/2) * W
                y2 = (yc + h/2) * H
                gt_boxes.append((cls, (x1, y1, x2, y2)))

        # Evaluate YOLO → match to GT → crop → ensemble classification
        for box in boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            det_box = (x1, y1, x2, y2)

            # Find the best matching GT box. A zero-area detection has IoU 0
            # with everything, so it never matches and is never cropped.
            best_iou = 0
            best_gt = None
            for gt_cls, gt_box in gt_boxes:
                iou = compute_iou(det_box, gt_box)
                if iou > best_iou:
                    best_iou = iou
                    best_gt = gt_cls

            # Unmatched detections are excluded from the metrics, so skip the
            # (expensive) classification for them altogether.
            if best_gt is None or best_iou < IOU_THRESHOLD:
                continue

            yolo_cls = int(box.cls.item())
            yolo_conf = float(box.conf.item())

            crop = img.crop(det_box)
            pred_cls, _ = classify_ensemble(crop, yolo_cls, yolo_conf)

            y_true.append(best_gt)
            y_pred.append(pred_cls)

    if not y_true:
        # Metrics are undefined on an empty set; say so instead of printing NaN.
        print("No detections matched a ground-truth box; metrics cannot be computed.")
        return

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # zero_division=0 yields the same values as the default but without a
    # warning for classes that never occur among the predictions or labels.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    acc = np.mean(y_true == y_pred) * 100

    print(f"Ensemble Accuracy: {acc:.2f}%")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-Score: {f1:.3f}")


In [None]:
# Run the evaluation on the test split (images and their label files).
test_images_dir = "/content/drive/MyDrive/Colab Notebooks/MINIPROJECT/RDD2022MAIN/RDD_SPLIT/test/images"
test_labels_dir = "/content/drive/MyDrive/Colab Notebooks/MINIPROJECT/RDD2022MAIN/RDD_SPLIT/test/labels"

evaluate_ensemble(test_images_dir, test_labels_dir)
