### Imports


In [1]:
from dataset import*
from utility import*
from baseline import*
from transformer import*
from training import *
import ultralytics
from ultralytics import YOLO
ultralytics.checks()

Ultralytics YOLOv8.1.0 🚀 Python-3.8.18 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 5938MiB)
Setup complete ✅ (12 CPUs, 15.4 GB RAM, 330.8/456.0 GB disk)


In [2]:
print(f"We have {'' if torch.cuda.is_available() else 'not'} access to a GPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
print(device)

We have  access to a GPU
0
<torch.cuda.device object at 0x7f253a76d490>
1
NVIDIA GeForce RTX 3060 Laptop GPU
cuda


In [3]:
seed_everything(42)

In [4]:
project_folder = '/home/anto/University/Driving-Visual-Attention/'

In [5]:
# Choose size of the eyes
dim = (32,64)
# mean and std of images, calculated in advance
mean = (0.4570, 0.4422, 0.3900)
std = (0.2376, 0.2295, 0.2261)

my_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(dim, antialias=True),
    transforms.Normalize(mean=mean, std=mean, inplace=True)
])

In [6]:
EPOCHS = 1
BATCH_SIZE = 1
THRESHOLD = 250
bbox_accuracy_class = BBoxAccuracy()
criterion = nn.L1Loss()

In [7]:
save_test_file = '/home/anto/University/Driving-Visual-Attention/save/save_test100'
test_dataset = DGAZEDataset('test',save_test_file, my_transforms, big_file=False)
print(f'Test dataset len is {len(test_dataset)}')
test_dataloader = DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)

Test dataset len is 22783


### YOLO 


##### Training YOLO

In [8]:
#dataset_path = project_folder + '/EAI_Napoli/datasetCoco/data.yaml'
#!yolo task=detect mode=train model=yolov8m.pt data=$dataset_path epochs=40 imgsz=640 pretrained=True batch=32

##### Testing YOLO

In [9]:
yolo_model_path = project_folder + '/save/yolo_best.pt'
yolo_model = YOLO(yolo_model_path)

In [10]:
image1 = project_folder +'data/images_aligned/driver12/road_view/sample104/frame_0009.jpg'
results = yolo_model(image1)  # return a list of Results objects
result = results[0]
boxes = result.boxes.cpu().numpy()  # Boxes object for bbox outputs
for box in boxes:
    # Extract bounding box coordinates as integers
    bbox = box.xyxy[0].astype(int)
    # Extract the classification name using the class index
    class_index = int(box.cls[0])
    class_name = result.names[class_index] 
    # Create a dictionary for the current bounding box and name
    current_dict = {'bbox': bbox, 'class_name': class_name}
    print(current_dict)


image 1/1 /home/anto/University/Driving-Visual-Attention/data/images_aligned/driver12/road_view/sample104/frame_0009.jpg: 384x640 2 persons, 2 cars, 1 bus_stop, 111.9ms
Speed: 2.0ms preprocess, 111.9ms inference, 330.9ms postprocess per image at shape (1, 3, 384, 640)
{'bbox': array([ 999,  111, 1920, 1012]), 'class_name': 'bus_stop'}
{'bbox': array([221, 578, 264, 690]), 'class_name': 'person'}
{'bbox': array([586, 628, 666, 717]), 'class_name': 'person'}
{'bbox': array([738, 645, 843, 706]), 'class_name': 'car'}
{'bbox': array([865, 627, 968, 701]), 'class_name': 'car'}


### Testing Gaze Estimation+YOLO for Attention

In [11]:
gaze_model = GazeCNN(additional_features_size=7)
checkpoint_path = project_folder + '/save/best_CNN_baseline_64acc.pth'
# Load the checkpoint
checkpoint = torch.load(checkpoint_path)
# Load the model state dictionary
gaze_model.load_state_dict(checkpoint['model_state_dict'])
gaze_model.eval()
gaze_model.to(device)

GazeCNN(
  (eye_feature_extractor): EyeFeatureExtractor(
    (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu): LeakyReLU(negative_slope=0.01)
    (block): ConvolutionBlock(
      (conv_block): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (batch_norm_block): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu_block): LeakyReLU(negative_slope=0.01)
    )
    (pool): MaxPool2d(kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
    (dropout): Dropout(p=0.1, inplace=False)
    (conv2): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (mlp_head): MLPHead(
    (fc_additional): Sequential(
      (0): Linear(in_features=7, out_features=16, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
    )
    (fc_merged): Sequential(
      (0): Linear(in_features=688, out_features=64, bias=True)
      (1): LeakyReLU(nega

In [12]:
for epoch in range(EPOCHS):
    # Testing
    test_loss, test_accuracy, bbox_accuracy, error = validate(gaze_model, bbox_accuracy_class , test_dataloader, THRESHOLD, criterion, device, epoch, BATCH_SIZE)
    #log_image(val_loader, model, device)

print("test_loss", test_loss)
print("accuracy_threshold", test_accuracy*100)
print("accuracy_bbox", bbox_accuracy*100)
print("error", error)

Validation Epoch 0: 100%|██████████| 46/46 [00:09<00:00,  4.71batch/s, batch accuracy=20.20%]

test_loss 228.46461254617444
accuracy_threshold 37.56521924034409
accuracy_bbox 15.969565154417701
accuracy_paper(error) 371.93436232857084





##### Attention

In [13]:
do_Plot = False

In [15]:
all_results = []  # To store the results for all images
for eye,features,_, true_bbox, driver_path in tqdm(test_dataloader):
    eye, features = eye.to(device), features.to(device)
    # Estimate the gaze 
    estimated_gaze = gaze_model(eye,features).squeeze(0).tolist()
    estimated_gaze = tuple(estimated_gaze)
    # Check if the gaze point is inside any true bounding box
    true_bbox = tuple(true_bbox.squeeze(0).tolist())
    is_inside_true_bbox = is_point_inside_bbox(estimated_gaze, true_bbox)
    
    # Run YOLO
    road_path =  driver_path[0].replace('driver_view', 'road_view')
    results = yolo_model(road_path, verbose=False)
    result = results[0] # we pass only one image at a time
    yolo_boxes = result.boxes.cpu().numpy()
    names = result.names
    for box in yolo_boxes:
        # Extract bounding box coordinates as integers
        bbox = box.xyxy[0].astype(int)
        # Extract the classification name using the class index
        class_index = int(box.cls[0])
        class_name = result.names[class_index] 
        # Check if point is inside the bbox
        is_inside_yolo_bbox = is_point_inside_bbox(estimated_gaze,bbox)
        if is_inside_yolo_bbox and is_inside_true_bbox:
            attention_score = 2
            current_dict = {
                'image_path': driver_path,
                'attention_score': attention_score,
                'obj_name': class_name
            }
            break
        elif is_inside_yolo_bbox and not is_inside_true_bbox:
            attention_score = 1
            current_dict = {
                'image_path': driver_path,
                'attention_score': attention_score,
                'obj_name': class_name
            }
            break
        else:
            continue
    
    if not is_inside_true_bbox and not is_inside_yolo_bbox:
        attention_score = 0
        current_dict = {
            'image_path': driver_path,
            'attention_score': attention_score,
        }

    if do_Plot:
        if 'obj_name' in current_dict:
            ### PLOT
            road_photo = cv2.imread(road_path)
            road_photo = cv2.cvtColor(road_photo, cv2.COLOR_BGR2RGB)
            plt.imshow(road_photo)
            plt.axis('off')
            fig, ax = plt.subplots(1)
            gaze_x, gaze_y = estimated_gaze
            ax.plot(gaze_x, gaze_y, 'ro', markersize=25)
            for box in yolo_boxes:
                bbox = box.xyxy[0].astype(int)
                rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1],
                                        linewidth=2, edgecolor='lightcoral', facecolor='none')
                ax.add_patch(rect)
            # Add text annotation for attention score and object name
            object = current_dict['obj_name']
            ax.text(10, 10, f'Attention Score: {attention_score}\nObject: {object}',
                    color='red', fontsize=10, bbox=dict(facecolor='black', alpha=0.7))
            ax.imshow(road_photo)
            ax.axis('off')
            plt.show()
            break
        else:
            continue

    # Append the dictionary to the list of results
    all_results.append(current_dict)

  0%|          | 0/46 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)

In [None]:
from collections import Counter
# Extract the class names from the 'obj_name' key (if it exists)
class_names = [item.get('obj_name', 'Gaze outside any box') for item in all_results]

# Count the occurrences of each class
class_counts = Counter(class_names)

# Display the result
for class_name, count in class_counts.items():
    print(f'{class_name}: {count/len(all_results)*100:.2f}%')

In [None]:
correct = 0
wrong = 0
semi = 0
for dict in all_results:
    attention_score = dict.get('attention_score')
    if attention_score == 2:
        correct += 1
    if attention_score == 0:
        wrong +=1
    if attention_score == 1:
        semi +=1
print(f'Inside correct bbox: {correct/len(all_results)*100:.2f}%')
print(f'Inside another bbox: {semi/len(all_results)*100:.2f}%')
print(f'Inside NO bbox: {wrong/len(all_results)*100:.2f}%')