### Imports


In [1]:
from dataset import*
from utility import*
from baseline import*
from transformer import*
import ultralytics
from ultralytics import YOLO
ultralytics.checks()

Ultralytics YOLOv8.1.0 🚀 Python-3.8.18 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 5938MiB)
Setup complete ✅ (12 CPUs, 15.4 GB RAM, 329.8/456.0 GB disk)


In [2]:
print(f"We have {'' if torch.cuda.is_available() else 'not'} access to a GPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
print(device)

We have  access to a GPU
0
<torch.cuda.device object at 0x7f16743e4bb0>
1
NVIDIA GeForce RTX 3060 Laptop GPU
cuda


In [3]:
seed_everything(42)

In [4]:
project_folder = '/home/anto/University/Driving-Visual-Attention/'

In [5]:
# Choose size of the eyes
dim = (32,64)
# mean and std of images, calculated in advance
mean = (0.4570, 0.4422, 0.3900)
std = (0.2376, 0.2295, 0.2261)

my_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(dim, antialias=True),
    transforms.Normalize(mean=mean, std=mean, inplace=True)
])

In [6]:
save_test_file = '/home/anto/University/Driving-Visual-Attention/save/save_test100'
test_dataset = DGAZEDataset('test',save_test_file, my_transforms)
print(f'Test dataset len is {len(test_dataset)}')
test_dataloader = DataLoader(test_dataset,1)

Test dataset len is 22783


### YOLO 


##### Training YOLO

In [7]:
#dataset_path = project_folder + '/EAI_Napoli/datasetCoco/data.yaml'
#!yolo task=detect mode=train model=yolov8m.pt data=$dataset_path epochs=40 imgsz=640 pretrained=True batch=32

##### Testing YOLO

In [8]:
yolo_model_path = project_folder + '/YOLO runs/runs/detect/train7/weights/best.pt'
yolo_model = YOLO(yolo_model_path)

In [9]:
image1 = project_folder +'data/images_aligned/driver12/road_view/sample104/frame_0009.jpg'
results = yolo_model(image1)  # return a list of Results objects
result = results[0]
boxes = result.boxes.cpu().numpy()  # Boxes object for bbox outputs
for box in boxes:
    # Extract bounding box coordinates as integers
    bbox = box.xyxy[0].astype(int)
    # Extract the classification name using the class index
    class_index = int(box.cls[0])
    class_name = result.names[class_index] 
    # Create a dictionary for the current bounding box and name
    current_dict = {'bbox': bbox, 'class_name': class_name}
    print(current_dict)


image 1/1 /home/anto/University/Driving-Visual-Attention/data/images_aligned/driver12/road_view/sample104/frame_0009.jpg: 384x640 4 persons, 1 car, 1 bus_stop, 108.6ms
Speed: 1.8ms preprocess, 108.6ms inference, 358.9ms postprocess per image at shape (1, 3, 384, 640)
{'bbox': array([1005,  114, 1920,  989]), 'class_name': 'bus_stop'}
{'bbox': array([227, 578, 263, 694]), 'class_name': 'person'}
{'bbox': array([851, 633, 971, 701]), 'class_name': 'car'}
{'bbox': array([1307,  342, 1416,  452]), 'class_name': 'person'}
{'bbox': array([ 73, 565, 116, 655]), 'class_name': 'person'}
{'bbox': array([589, 597, 653, 718]), 'class_name': 'person'}


### Testing Gaze Estimation+YOLO for Attention

In [10]:
gaze_model = GazeCNN(additional_features_size=7)
checkpoint_path = project_folder + '/save/baseline_epochs3_250.pth'
# Load the checkpoint
checkpoint = torch.load(checkpoint_path)
# Load the model state dictionary
gaze_model.load_state_dict(checkpoint['model_state_dict'])
gaze_model.eval()

GazeCNN(
  (eye_feature_extractor): EyeFeatureExtractor(
    (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu): LeakyReLU(negative_slope=0.01)
    (block): ConvolutionBlock(
      (conv_block): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (batch_norm_block): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu_block): LeakyReLU(negative_slope=0.01)
    )
    (pool): MaxPool2d(kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
    (dropout): Dropout(p=0.1, inplace=False)
    (conv2): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (mlp_head): MLPHead(
    (fc_additional): Sequential(
      (0): Linear(in_features=7, out_features=16, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
    )
    (fc_merged): Sequential(
      (0): Linear(in_features=688, out_features=128, bias=True)
      (1): LeakyReLU(neg

##### Attention

In [12]:
all_results = []  # To store the results for all images
total_hits = 0  # To count the total number of times the point was inside any bounding box

for eye,features,_, true_bbox, driver_path in tqdm(test_dataloader):
    # Estimate the gaze 
    estimated_gaze = gaze_model(eye,features).squeeze(0).tolist()
    estimated_gaze = tuple(estimated_gaze)
    # Check if the gaze point is inside any true bounding box
    true_bbox = tuple(true_bbox.squeeze(0).tolist())
    is_inside_true_bbox = is_point_inside_bbox(estimated_gaze, true_bbox)
    
    # Run YOLO
    road_path =  driver_path[0].replace('driver_view', 'road_view')
    results = yolo_model(road_path, verbose=False)
    result = results[0] # we pass only one image at a time
    yolo_boxes = result.boxes.cpu().numpy()
    names = result.names
    for box in yolo_boxes:
        # Extract bounding box coordinates as integers
        bbox = box.xyxy[0].astype(int)
        # Extract the classification name using the class index
        class_index = int(box.cls[0])
        class_name = result.names[class_index] 
        # Create a dictionary for the current bounding box and name
        current_dict = {'bbox': bbox, 'class_name': class_name}
        # Check if point is inside the bbox
        is_inside_yolo_bbox = is_point_inside_bbox(estimated_gaze,bbox)
        if is_inside_yolo_bbox and is_inside_true_bbox:
            attention_score = 2
            current_dict = {
                'image_path': driver_path,
                'attention_score': attention_score,
                'obj_name': class_name
            }
            break
        elif is_inside_yolo_bbox and not is_inside_true_bbox:
            attention_score = 1
            current_dict = {
                'image_path': driver_path,
                'attention_score': attention_score,
                'obj_name': class_name
            }
            break
        else:
            continue
    
    if not is_inside_true_bbox and not is_inside_yolo_bbox:
        attention_score = 0

    # Create a dictionary for the current image
    current_dict = {
        'image_path': driver_path,
        'attention_score': attention_score,
    }

    # Append the dictionary to the list of results
    all_results.append(current_dict)

    # Update total_hits based on attention_score
    if attention_score > 0:
        total_hits += 1

# Print the total number of hits
print(f"Total hits: {total_hits}")

# Print the results for all images
print("Results:")
for result in all_results:
    print(result)

100%|██████████| 22783/22783 [13:24<00:00, 28.32it/s]


Total hits: 7974
Results:
{'image_path': ('/home/anto/University/Driving-Visual-Attention/data/images_aligned/driver14/driver_view/sample56/frame_0043.jpg',), 'attention_score': 2}
{'image_path': ('/home/anto/University/Driving-Visual-Attention/data/images_aligned/driver14/driver_view/sample56/frame_0096.jpg',), 'attention_score': 0}
{'image_path': ('/home/anto/University/Driving-Visual-Attention/data/images_aligned/driver14/driver_view/sample56/frame_0031.jpg',), 'attention_score': 2}
{'image_path': ('/home/anto/University/Driving-Visual-Attention/data/images_aligned/driver14/driver_view/sample56/frame_0087.jpg',), 'attention_score': 0}
{'image_path': ('/home/anto/University/Driving-Visual-Attention/data/images_aligned/driver14/driver_view/sample56/frame_0008.jpg',), 'attention_score': 0}
{'image_path': ('/home/anto/University/Driving-Visual-Attention/data/images_aligned/driver14/driver_view/sample56/frame_0001.jpg',), 'attention_score': 0}
{'image_path': ('/home/anto/University/Drivi