### Requirements

In [1]:
from dataset import*
from utility import*
from training import *
from vit_pytorch.vit import ViT

In [2]:
# Project root used to build the save-file paths and the weights path.
# Set the DRIVING_VISUAL_ATTENTION_ROOT environment variable to run the notebook
# on another machine; the original location remains the default.
import os

root_project = os.path.join(
    os.environ.get('DRIVING_VISUAL_ATTENTION_ROOT')
    or '/home/anto/University/Driving-Visual-Attention/',
    '',  # guarantees the trailing separator that later string concatenations rely on
)

In [3]:
# Query CUDA availability once and reuse the answer for the message, the
# device selection and the diagnostics below.
cuda_available = torch.cuda.is_available()

# The separating space belongs to the optional word, so the GPU case no longer
# prints a double space ("We have  access to a GPU").
gpu_status_message = f"We have {'' if cuda_available else 'not '}access to a GPU"
print(gpu_status_message)

# Device that the model and the batches are moved to later in the notebook.
device = torch.device("cuda" if cuda_available else "cpu")
if cuda_available:
    # Diagnostics for the first CUDA device.
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
print(device)

We have not access to a GPU
cpu


  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Fix the random seed before any data loading or model creation.
# NOTE(review): seed_everything comes from one of the project star-imports;
# presumably it seeds the Python, NumPy and PyTorch RNGs — confirm in its source.
seed_everything(42)

### Data Loader and Visualization

##### Files where to write the paths and labels

In [5]:
# Value handed to DataLoaderVisualizer and used as the numeric suffix of every
# save file (presumably the share of the dataset to use — confirm in dataset.py).
percentage = 1

# One save file per split, all located under <root_project>save/.
save_train_file = f"{root_project}save/save_train{percentage}"
save_val_file = f"{root_project}save/save_val{percentage}"
save_test_file = f"{root_project}save/save_test{percentage}"

##### Train Loader

In [6]:
# Helper object for the 'train' split, given the project root, the train save
# file and the percentage. On the recorded run it reported that the dataset had
# already been prepared.
# NOTE(review): what DataLoaderVisualizer does with these arguments is defined
# in the project modules — confirm there.
train_dataset_classloader = DataLoaderVisualizer(root_project,save_train_file,percentage,'train')

Building path structure

The dataset has already been prepared, ready to use


##### Validation Loader

In [7]:
# Helper object for the 'val' split, given the project root, the validation save
# file and the percentage. On the recorded run it reported that the dataset had
# already been prepared.
# NOTE(review): what DataLoaderVisualizer does with these arguments is defined
# in the project modules — confirm there.
val_dataset_classloader = DataLoaderVisualizer(root_project,save_val_file,percentage,'val')

Building path structure

The dataset has already been prepared, ready to use


##### Test Loader

In [8]:
# Helper object for the 'test' split, given the project root, the test save
# file and the percentage. On the recorded run it reported that the dataset had
# already been prepared.
# NOTE(review): what DataLoaderVisualizer does with these arguments is defined
# in the project modules — confirm there.
test_dataset_classloader = DataLoaderVisualizer(root_project,save_test_file,percentage,'test')

Building path structure

The dataset has already been prepared, ready to use


##### Visualization

In [9]:
#train_dataset_classloader.visualize_dataset()

In [10]:
#val_dataset_classloader.visualize_dataset()

In [11]:
#test_dataset_classloader.visualize_dataset()

### PyTorch Dataset

In [12]:
# Parameters handed to CropTransform.
# NOTE(review): the meaning and order of the four values are defined by
# CropTransform in the project modules — confirm there.
crop_params = (125, 75, 768, 768)

# Per-channel mean / std used for normalisation (the usual ImageNet statistics).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Applied in this order: convert to tensor, crop, then normalise.
preprocessing_steps = [
    transforms.ToTensor(),
    CropTransform(crop_params),
    transforms.Normalize(channel_mean, channel_std),
]
my_transforms = transforms.Compose(preprocessing_steps)

In [13]:
# Training dataset built from the train save file with the shared transforms.
# The file suffix follows `percentage` rather than a hard-coded "1", so it
# cannot go stale when `percentage` is changed.
train_dataset = DGAZEDataset('train', f'save/save_train{percentage}', my_transforms)
print(f'Train dataset len is {len(train_dataset)}')

Train dataset len is 1054


In [14]:
# Validation dataset built from the validation save file with the shared transforms.
# The file suffix follows `percentage` rather than a hard-coded "1", so it
# cannot go stale when `percentage` is changed.
val_dataset = DGAZEDataset('val', f'save/save_val{percentage}', my_transforms)
print(f'Val dataset len is {len(val_dataset)}')

Val dataset len is 127


In [15]:
# Test dataset built from the test save file with the shared transforms.
# The file suffix follows `percentage` rather than a hard-coded "1", so it
# cannot go stale when `percentage` is changed.
test_dataset = DGAZEDataset('test', f'save/save_test{percentage}', my_transforms)
print(f'Test dataset len is {len(test_dataset)}')

Test dataset len is 126


### Vision Transformer Model

##### Hyperparameters

In [16]:
# --- Training schedule ---
BATCH_SIZE = 32   # batch_size of both data loaders
EPOCHS = 15       # iterations of the training loop

# --- Input geometry (ViT image_size / patch_size) ---
IMAGE_SIZE = 768
PATCH_SIZE = 32

# --- Transformer architecture (ViT dim / depth / heads / mlp_dim) ---
DIM = 1024
DEPTH = 3
HEADS = 6
MLP_DIM = 1024

# --- Regularisation (ViT dropout) ---
DROPOUT = 0.1

In [17]:
# Gather the constructor arguments in one table so the architecture can be read
# (and logged) at a glance. num_classes is 2: the network emits two values per image.
vit_kwargs = dict(
    image_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    num_classes=2,
    dim=DIM,
    depth=DEPTH,
    heads=HEADS,
    mlp_dim=MLP_DIM,
    dropout=DROPOUT,
    emb_dropout=0.1,
)
model = ViT(**vit_kwargs)

# Move the model to the selected device; as the last expression of the cell
# this also displays the module tree.
model.to(device)

ViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=32, p2=32)
    (1): LayerNorm((3072,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=3072, out_features=1024, bias=True)
    (3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer): Transformer(
    (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-2): 3 x ModuleList(
        (0): Attention(
          (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attend): Softmax(dim=-1)
          (dropout): Dropout(p=0.1, inplace=False)
          (to_qkv): Linear(in_features=1024, out_features=1152, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=384, out_features=1024, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (1): FeedForward(
          (net): Sequential(
      

##### Criterion and Optimizer

In [18]:
# Loss: L1 (mean absolute error). nn.MSELoss() is the alternative kept for reference:
# criterion = nn.MSELoss()
criterion = nn.L1Loss()

# Optimizer: Adam over every parameter of the model.
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

##### Dataloader

In [19]:
# Training batches: reshuffled on every pass, trailing partial batch discarded.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

# Validation batches: fixed order, trailing partial batch kept.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [20]:
# Sanity check: number of batches the validation loader yields per pass.
print(len(val_loader))

4


### Training 

In [None]:
# Weights & Biases is used for experiment tracking; login() authenticates this session.
import wandb
wandb.login()
# Fraction passed to log_random_images in the training loop to choose how many
# samples of the first validation batch are logged to wandb for visualization.
random_percentage = 0.2  

In [None]:
# Start the W&B run. The run name is derived from `percentage` and `EPOCHS` so it
# cannot drift from the configuration actually used; with the current values it
# is still "train 1 percent 15 epoche".
wandb.init(project="GazeViT", name=f"train {percentage} percent {EPOCHS} epoche")

In [None]:
for epoch in range(EPOCHS):
    epoch_number = epoch + 1  # 1-based epoch index attached to every W&B record

    # Training.
    # The example-logging step at the end of each epoch leaves the model in eval
    # mode, so training mode is restored explicitly here instead of relying on
    # train_epoch to do it (the call is idempotent if train_epoch already does).
    model.train()
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    wandb.log({"train_loss": train_loss, "epoch": epoch_number})

    # Validation, explicitly in eval mode (idempotent if validate already sets it).
    model.eval()
    val_loss, val_accuracy = validate(model, val_loader, criterion, device)
    wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy, "epoch": epoch_number})

    # Qualitative check: annotate a few road-view images with prediction vs. label.
    with torch.no_grad():
        model.eval()

        # Take the first validation batch; each item is (images, labels, image paths).
        example_batch = next(iter(val_loader))
        example_images, example_labels, img_paths = example_batch

        # Calculate gaze point prediction
        example_predictions = model(example_images.to(device))

        # Convert tensors to numpy arrays for visualization
        # (predictions are moved back to the CPU first; images/labels never left it).
        example_images_np = example_images.detach().numpy()
        example_predictions_np = example_predictions.cpu().detach().numpy()
        example_labels_np = example_labels.detach().numpy()

        # Indices of the samples to log, chosen by the project helper.
        random_indices = log_random_images(example_images_np, random_percentage)

        images_with_predictions = []

        for idx in random_indices:
            pred = example_predictions_np[idx]
            label = example_labels_np[idx]
            img_path = img_paths[idx]

            # The road-view frame lives at the same path with 'driver_view'
            # replaced by 'road_view'.
            respective_road_view = img_path.replace('driver_view', 'road_view')

            # Mark the road-view with the prediction and the ground truth
            # (both truncated to integers before being passed on).
            road_view_image = mark_image(
                respective_road_view,
                tuple(pred.astype(int)),
                tuple(label.astype(int)),
            )

            # Queue the annotated road_view image for WandB
            images_with_predictions.append(
                wandb.Image(
                    road_view_image,
                    caption=f"Prediction: {pred}, Actual: {label}",
                )
            )

        wandb.log({"examples": images_with_predictions, "epoch": epoch_number})

# Finish the WandB run
wandb.finish()

### Test

In [None]:
# Persist only the learned parameters (the state_dict) under <root_project>save/.
weights_path = root_project + 'save/model_weights.pth'
torch.save(model.state_dict(), weights_path)