## Distillation

In [None]:
import torch
from ultralytics import YOLO
import os

model = YOLO("models\\trained_yolov8m.pt") 


# Run inference on training set to create pseudo-labels
results = model.predict(
    source="datasets\\100_images",  # path to folder with images
    save=False,                    # saves images with predictions (optional)
    save_txt=True,                # saves predictions in YOLO format (.txt)
    save_conf=True,               # includes confidence score       
    name="pseudo_labels",        # output folder: runs/detect/pseudo_labels
    exist_ok=True                 # overwrite if already exists
)


image 1/100 c:\Users\yurim\Documents\University\UM\Year_3\Bachelor_Thesis\Weed_Detection_eSys\datasets\100_images\ave-0045-0012.jpg: 384x640 14 Crops, 11 Weeds, 49.7ms
image 2/100 c:\Users\yurim\Documents\University\UM\Year_3\Bachelor_Thesis\Weed_Detection_eSys\datasets\100_images\ave-0047-0003.jpg: 384x640 15 Weeds, 10.8ms
image 3/100 c:\Users\yurim\Documents\University\UM\Year_3\Bachelor_Thesis\Weed_Detection_eSys\datasets\100_images\ave-0047-0017.jpg: 384x640 23 Weeds, 12.1ms
image 4/100 c:\Users\yurim\Documents\University\UM\Year_3\Bachelor_Thesis\Weed_Detection_eSys\datasets\100_images\ave-0083-0005.jpg: 384x640 1 Crop, 1 Weed, 10.0ms
image 5/100 c:\Users\yurim\Documents\University\UM\Year_3\Bachelor_Thesis\Weed_Detection_eSys\datasets\100_images\ave-0083-0030.jpg: 384x640 2 Crops, 2 Weeds, 10.6ms
image 6/100 c:\Users\yurim\Documents\University\UM\Year_3\Bachelor_Thesis\Weed_Detection_eSys\datasets\100_images\ave-0105-0018.jpg: 384x640 2 Crops, 16 Weeds, 11.7ms
image 7/100 c:\Use

In [None]:
import torch.nn.functional as F

def compute_detection_loss(predictions, targets):
    """
    Compute the detection loss for YOLO models.
    :param predictions: Output from the student model.
    :param targets: Ground truth labels.
    :return: Total detection loss.
    """
    # Example: Combine classification, objectness, and bounding box losses
    cls_loss = F.cross_entropy(predictions['cls_logits'], targets['cls_labels'])
    obj_loss = F.binary_cross_entropy(predictions['obj_scores'], targets['obj_scores'])
    bbox_loss = F.mse_loss(predictions['bbox_coords'], targets['bbox_coords'])

    total_loss = cls_loss + obj_loss + bbox_loss
    return total_loss

In [None]:
import torch
import torch.nn.functional as F
from ultralytics import YOLO
from Dataloader import CustomDataset
from torch.utils.data import DataLoader

# Load and extract the models
teacher_wrapper = YOLO('models\\trained_yolov8m.pt') # teacher model
teacher_model = teacher_wrapper.model
teacher_model.eval() 

student_wrapper = YOLO('models\custom_model.yaml') # student model
student_model = student_wrapper.model
student_model.train()

# Create dataset and dataloader
dataset = CustomDataset(annotations_dir='datasets\yolo_CropOrWeed2\labels\\train', img_dir='datasets\yolo_CropOrWeed2\images\\train', transform=None)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=CustomDataset.collate_fn)

# Hyperparameters for distillation
T = 2.0           # Temperature for softening logits
lambda_kd = 0.5   # Weight for the distillation loss

optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)

for images, labels in data_loader:
    images = images.to('cuda') 
    labels = labels.to('cuda')
    
    # Teacher output (no gradient tracking)
    with torch.no_grad():
        teacher_outputs = teacher_model(images)
        teacher_logits = teacher_outputs['cls_logits']  # Adjust index as needed
    
    # Student output
    student_outputs = student_model(images)
    student_logits = student_outputs['cls_logits']  # Adjust based on your model structure 

    # Apply temperature scaling
    teacher_soft = F.softmax(teacher_logits / T, dim=1)
    student_log_soft = F.log_softmax(student_logits / T, dim=1)
    
    # Compute the distillation loss (KL-divergence)
    loss_kd = F.kl_div(student_log_soft, teacher_soft, reduction='batchmean') * (T * T)
    
    # Compute the standard detection loss (your custom YOLO loss function)
    # loss_detection = compute_detection_loss(student_outputs, labels) 
    loss_detection = compute_detection_loss(student_outputs, labels)  # TODO see if works

    # Combine the losses
    loss_total = loss_detection + lambda_kd * loss_kd
    
    optimizer.zero_grad()
    loss_total.backward()
    optimizer.step()
    
    print(f"Detection Loss: {loss_detection.item():.4f} | KD Loss: {loss_kd.item():.4f} | Total Loss: {loss_total.item():.4f}")


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
import torch
import torch.nn.functional as F
from ultralytics import YOLO
from Dataloader import CustomDataset
from torch.utils.data import DataLoader

# Load teacher and student models
device = 'cuda'
teacher = YOLO('models/trained_yolov8m.pt').model.to(device).eval()
student = YOLO('models/custom_model.yaml').model.to(device).train()

# DataLoader setup
dataset = CustomDataset(
    annotations_dir='datasets\yolo_100_images\labels',
    img_dir='datasets\yolo_100_images\images',
    transform=None
)
loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=CustomDataset.collate_fn
)

# Distillation hyperparameters
T = 2.0                      # Softmax temperature
lambda_kd = 0.5              # Distillation loss weight
optimizer = torch.optim.Adam(student.parameters(), lr=1e-4)

# Detection loss using Ultralytics implementation
def compute_detection_loss(model, imgs, labels):
    preds = model(imgs)
    loss, _ = model.loss(preds, labels)
    return loss

# Extract class logits from a single head and flatten to [N, C]
def extract_cls_flat(model_outputs, head_index=-1):
    # model_outputs may be (preds_list, features)
    preds = model_outputs[0] if isinstance(model_outputs, tuple) else model_outputs
    out = preds[head_index]        # e.g. tensor of shape (B, A, H, W, 5+nc)
    cls_logits = out[..., 5:]      # drop box coords & objectness → (B,A,H,W,C)
    B, A, H, W, C = cls_logits.shape
    flat = cls_logits.reshape(B * A * H * W, C)
    return flat

# Training loop
for imgs, labels in loader:
    imgs = imgs.to(device)
    labels = [lb.to(device) for lb in labels]

    # Teacher forward
    with torch.no_grad():
        teacher_out = teacher(imgs)
    # Student forward
    student_out = student(imgs)

    # Flatten class logits
    teacher_flat = extract_cls_flat(teacher_out)
    student_flat = extract_cls_flat(student_out)

    # Compute distillation loss (KL divergence)
    t = T
    log_p_s = F.log_softmax(student_flat / t, dim=1)
    p_t = F.softmax(teacher_flat / t, dim=1)
    loss_kd = F.kl_div(log_p_s, p_t, reduction='batchmean') * (t * t)

    # Compute detection loss
    loss_det = compute_detection_loss(student, imgs, labels)

    # Combine losses
    loss = loss_det + lambda_kd * loss_kd

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Det Loss: {loss_det.item():.4f} | KD Loss: {loss_kd.item():.4f} | Total: {loss.item():.4f}")


ValueError: not enough values to unpack (expected 5, got 2)

In [None]:
import torch
import torch.nn.functional as F
from ultralytics import YOLO
from Dataloader import CustomDataset
from torch.utils.data import DataLoader

# Load teacher and student as Ultralytics YOLO wrappers
device = 'cuda'
teacher = YOLO('models/trained_yolov8m.pt').model.to(device).eval()
student = YOLO('models/custom_model.yaml').model.to(device).train()

# DataLoader
dataset = CustomDataset(
    annotations_dir='datasets\yolo_100_images\labels',
    img_dir='datasets\yolo_100_images\images',
    transform=None
)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=CustomDataset.collate_fn)

# Hyperparameters
lambda_feat = 0.5  # feature distillation weight
default_lr = 1e-4
optimizer = torch.optim.Adam(student.model.parameters(), lr=default_lr)

# Detection loss using wrapper
def compute_detection_loss(wrapper, imgs, labels):
    preds = wrapper(imgs)               # wrapper returns (preds, features)
    loss, _ = wrapper.loss(preds, labels)
    return loss

# Feature extractor layers
teach_feat_layer = teacher.model[:3]  # first modules
stud_feat_layer  = student.model[:3]

# Training loop
for imgs, labels in loader:
    imgs = imgs.to(device)
    labels = [l.to(device) for l in labels]

    # Teacher features
    with torch.no_grad():
        t_feats = teach_feat_layer(imgs)

    # Student features
    s_feats = stud_feat_layer(imgs)

    # Detection loss via wrapper
    loss_det = compute_detection_loss(student, imgs, labels)

    # Feature distillation loss
    loss_feat = F.mse_loss(s_feats, t_feats)

    # Total loss
    total_loss = loss_det + lambda_feat * loss_feat

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    print(f"Det Loss: {loss_det.item():.4f} | Feat Loss: {loss_feat.item():.4f} | Total: {total_loss.item():.4f}")


RuntimeError: shape '[5, 66, -1]' is invalid for input of size 5