In [17]:
# import the relevant libraries
import numpy as np
import cv2
from mss import mss
from PIL import Image

import torch
import torchvision

import ctypes
import time
import win32api, win32con
from utils.torch_utils import select_device
from utils.general import non_max_suppression
from utils.augmentations import letterbox

from models.common import DetectMultiBackend

#User parameters
show = True # boolean if the user wants to see the model output
control = False # boolean if the user wants the model to control the input device

DEVICE = select_device('') # a method which returns gpu availability, provided by YOLOv5
LOAD_MODEL_FILE = "Halo640v2.pt" # the saved model file name
image_size = 1280 # size of the output screen
auto_screen = False # boolean should the program find the screen size automatically


centering = 1 #centering is the decimal percentage of the screen to account for, from the center, from 0 to 1

threshold = .8 # threshold is the lowest confidence bound to be allowed to be considered a detection

movement_scale =  640/image_size # a scaling factor based on the image size, required for proper mouse control.

YOLOv5  2022-4-17 torch 1.11.0 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)



In [18]:
model = torch.hub.load('ultralytics/yolov5', 'custom', path=LOAD_MODEL_FILE) # load the model trained on our Halo dataset
model.eval() # set the model to evaluation mode

Using cache found in C:\Users\blain/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2022-4-17 torch 1.11.0 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)

Fusing layers... 
Model summary: 213 layers, 7012822 parameters, 0 gradients
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): Model(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
           

In [19]:
# if the user wants to
if auto_screen:
    w, h = ctypes.windll.user32.GetSystemMetrics(0), ctypes.windll.user32.GetSystemMetrics(1)
else:
    w, h = 1280,807 # if the user wants to define the screen size, do so here

# find the center point of the screen
x_center = w/2
y_center = h/2

# because we want to be able to take screencaptures that do not use the entire screen, we need to define the top, bottom, left, and right bounds of the box in pixels
top = int(y_center - (y_center * centering))
left = int(x_center - (x_center * centering))
bottom = int(y_center + (y_center * centering))
right = int(x_center + (x_center * centering))

width = int(w * centering)
height = int(h * centering)

# we save these as a dictionary to be used with
monitor = {'top': top, 'left': left, 'width': width, 'height': height}

sct = mss() #instantiate the screenshot application

In [20]:

# Begin the loop of screen inference
while True:
    start = time.time() # begin a timer to calculate framerate

    orig_img = sct.grab(monitor) # capture the image from the screen defined in the monitor dict
    img = Image.frombytes('RGB', (orig_img.size.width, orig_img.size.height), orig_img.rgb) # perform several conversions to prepare the image for the model

    with torch.no_grad():
        pred = model(img).pred # perform inference of the image.  The

    img = np.array(img.convert('RGB')) # convert the image back to the correct color space and

    framerate = 1/(time.time() - start) # stop the timer and calculate the framerate

    cv2.putText(img, str(framerate), (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA) #add framerate to the cv2 image

    if len(pred) > 0: # if there were any predicted targets
        closest = 99999 # set an arbitrarily large value for distance to the center
        for box in pred[0]: # loop through all predicted targets
            if box[4] > threshold: # if the prediction is above the user set confidence threshold
                box_dim = (box[3] - box[1])  #Caclulate the box dimension to be used to scale inputs later on
                x = (box[2] + box[0]) / 2 # find the x pixel value for the target
                y = ((box[3] + box[1]) / 2) - ((box[3] - box[1]) / 3) # find the y pixel value for the target (scale up by 1/3 to aim for headshots)

                # add the target to the image
                cv2.circle(img,
                    (int(x), int(y)),
                    radius=10,
                    color=(255, 0, 0),
                    thickness=5
                )

                #get the coordinates in the full size space
                x = x / img.shape[1] * orig_img.size[0]
                y = y / img.shape[0] * orig_img.size[1]

                dist_x = x.item()-x_center
                dist_y = y.item()-y_center
                dist = (dist_x**2 + dist_y**2)**(.5)
                if dist < closest:
                    closest = dist
                    dx = dist_x
                    dy = dist_y
                    best_scale = box_dim/orig_img.size.height


        if control and closest < 99999:
            win32api.mouse_event(win32con.MOUSEEVENTF_MOVE,
                                 int(.5*framerate* movement_scale*best_scale*dx),
                                 int(.5*framerate* movement_scale*best_scale*dy),
                                 )

    if show:
        cv2.imshow('Model View', np.array(img)[:,:,[2,1,0]])

        if cv2.waitKey(1) & 0xFF == ord('q'):
            cv2.destroyAllWindows()
            break

    else:
        print(framerate)