In [None]:
# Install the notebook's third-party dependencies.
# NOTE(review): only `datasets` is version-pinned here; transformers, nltk, rouge_score and
# nvidia-ml-py3 resolve to whatever is latest at install time — consider pinning them too.
!pip install datasets==2.9.0 transformers nltk rouge_score nvidia-ml-py3

In [None]:
import nltk

# Fetch the NLTK resources used by this notebook.
# 'punkt' serves older NLTK releases; recent releases look up 'punkt_tab' instead when
# sent_tokenize() is called, so both are requested because nltk is installed unpinned.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

## Load the arXiv summarization dataset (`ccdv/arxiv-summarization`) from HuggingFace

In [None]:
from datasets import load_dataset
# dataset = load_dataset("ccdv/arxiv-summarization")
# Both subsets are slices of the "train" split:
#   train[:50000]       -> 50k samples used for training
#   train[50000:62500]  -> 12.5k samples used for validation
# Passing a list of splits yields one dataset per entry, in the same order.
dataset = load_dataset(
    path="ccdv/arxiv-summarization",
    split=['train[:50000]', 'train[50000:62500]'],
)

In [None]:
# Display what load_dataset returned for the two requested slices.
dataset

In [None]:
from datasets import DatasetDict

# Wrap the two slices in a DatasetDict so they can be addressed by split name:
# index 0 is the training slice, index 1 the validation slice.
small_dataset = DatasetDict({"train": dataset[0], "validation": dataset[1]})

In [None]:
from pynvml import *


def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which keeps
            existing zero-argument calls working unchanged.

    The value is NVML's ``used`` byte count integer-divided by 1024**2.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with a shutdown so repeated calls from many cells
        # do not leave NVML initialised behind them.
        nvmlShutdown()


def print_summary(result):
    """Print the training runtime and throughput from ``result``, then GPU memory use.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            'train_runtime' and 'train_samples_per_second'.
    """
    metrics = result.metrics
    print(f"Time: {metrics['train_runtime']:.2f}")
    print(f"Samples/second: {metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

## Specify the model checkpoint from which to fine-tune

In [None]:
# Checkpoint that fine-tuning starts from.
model_checkpoint = "facebook/bart-large"
# Keep only the part after the last '/' and append the task suffix.
# The result is reused later as output directory, run name and pipeline model id.
model_name = model_checkpoint.rsplit('/', 1)[-1] + "-arxiv-sum"

## Preprocess the data

1. Load BartTokenizerFast to tokenize & encode the texts. Pad to the right.
2. Truncate each document to 512 tokens and each abstract to 256 tokens.
3. Define preprocessing function to tokenize, truncate and arrange data properly.
4. Map preprocess function to each sample from data.

In [None]:
# GPU memory reading taken before the tokenizer is loaded.
print_gpu_utilization()

In [None]:
""" Step 1: Load BartTokenizerFast to tokenize & encode the texts. Pad to the right """
from transformers import AutoTokenizer, BartTokenizerFast

# Load the tokenizer that belongs to `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Alternative kept for reference: request the BART fast tokenizer class explicitly.
# tokenizer = BartTokenizerFast.from_pretrained(model_checkpoint)

# Smoke test: encode one sentence and print the resulting encoding.
print(tokenizer("This tokenization thing is pretty cool!"))

In [None]:
# GPU memory reading taken after the tokenizer has been loaded.
print_gpu_utilization()

In [None]:
""" Step 2: Truncate each document to 2048 tokens and each abstract to 256 tokens """

max_input_length = 512
max_target_length = 256

""" Step 3: Define preprocessing function to tokenize, truncate and arrange data properly """

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["abstract"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
""" Step 4: Map preprocess function to each sample from data """

# Apply preprocess_function to every split; batched=True hands it batches of examples
# rather than single rows.
tokenized_datasets = small_dataset.map(preprocess_function, batched=True)

In [None]:
# Inspect the column schema of the tokenized training split.
tokenized_datasets['train'].features

In [None]:
# Name of the Hub dataset repository that receives the tokenized splits.
# (This assignment was commented out, which made the call below raise NameError.)
dataset_name = 'arxiv-summarization-small-tokenized'
# NOTE(review): pushing requires Hub credentials, but the notebook_login() cell appears
# further down in this notebook — confirm a token is already available when this runs.
tokenized_datasets.push_to_hub(repo_id=dataset_name, private=True)

## Setup for Training

1. Load the model (BartForConditionalGeneration)
2. Set arguments & hyperparameters
3. Specify logic for computing ROUGE scores
4. Instantiate Data Collator
5. Login to HF to log training and push to HF-Hub
6. Instantiate Trainer object
7. Evaluate performance prior to fine-tuning
8. Start training
9. Evaluate performance post fine-tuning
10. Tag as a Summarization model and push to HF Hub

In [None]:
# GPU memory reading taken before the model is loaded.
print_gpu_utilization()

In [None]:
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
""" Step 1: Load the model (BartForConditionalGeneration) """

from transformers import AutoModelForSeq2SeqLM, BartForConditionalGeneration

# Load the pretrained sequence-to-sequence model for `model_checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Alternative kept for reference: request the BART class explicitly.
# model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
# GPU memory reading taken after the model has been loaded.
print_gpu_utilization()

In [None]:
""" Step 2: Set arguments & hyperparameters """

from transformers import Seq2SeqTrainingArguments

batch_size = 32
num_train_epochs = 20
gradient_accumulation_steps = 2
# Show the training loss with every epoch.
# One optimizer step consumes batch_size * gradient_accumulation_steps samples, so the
# number of steps per epoch has to account for accumulation; max(1, ...) guards against
# a zero step count on very small datasets.
logging_steps = max(
    1, len(tokenized_datasets["train"]) // (batch_size * gradient_accumulation_steps)
)

args = Seq2SeqTrainingArguments(
    output_dir=model_name,
    evaluation_strategy="steps",
    # Evaluate on the same cadence as logging (the value was missing, a syntax error).
    eval_steps=logging_steps,
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Run generation during evaluation so compute_metrics receives generated summaries.
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
    fp16=True,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    run_name=model_name,
)

In [None]:
""" Step 3: Specify logic for computing ROUGE scores """

import numpy as np
from datasets import load_metric
from nltk.tokenize import sent_tokenize

# Load the ROUGE metric once instead of on every evaluation pass.
rouge_score_metric = load_metric('rouge')


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also arrive as a tuple, in which case its first element is used.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure, scaled to 0-100 and
        rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
    # It can show up in predictions as well as labels once batches are concatenated.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score_metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep the mid F-measure of each aggregated score, expressed as a percentage
    return {key: round(value.mid.fmeasure * 100, 4) for key, value in result.items()}

In [None]:
""" Step 4: Instantiate Data Collator """

from transformers import DataCollatorForSeq2Seq

# Collator used by the trainer to assemble batches; it is given both the tokenizer
# and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
""" Step 5: Login to HF to log training and push to HF-Hub """


from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook; the training
# arguments set push_to_hub=True and the trained model is pushed at the end.
notebook_login()

In [None]:
""" Step 6: Instantiate Trainer object """

from transformers import Seq2SeqTrainer

# Bundle the model, training arguments, data splits, collator, tokenizer and metric
# function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
""" Step 7: Evaluate performance prior to fine-tuning """

# NOTE(review): the baseline evaluation is commented out, so this cell produces no
# pre-fine-tuning metrics to compare against.
# trainer.evaluate()

In [None]:
# GPU memory reading taken right before training starts.
print_gpu_utilization()

In [None]:
""" Step 8: Start training """

# Run fine-tuning and keep the returned training output in `result`.
result = trainer.train()

In [None]:
# The full summary (runtime and samples/second) is disabled; only the GPU memory
# reading is printed after training.
# print_summary(result)
print_gpu_utilization()

In [None]:
""" Step 9 : Evaluate performance post fine-tuning """

# Evaluate the fine-tuned model on the trainer's eval_dataset; the returned metrics
# are displayed as the cell output.
trainer.evaluate()

In [None]:
""" Step 10: Tag as a Summarization model and push to HF Hub """

# Upload the trained model to the Hub with a final commit message and the
# "summarization" tag.
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

## Run inference with newly fine-tuned model

In [None]:
TEXT = """Real-time GPU-accelerated Driver Assistance System
Pooja Ravi, Aditya Shukla, and B. Muruganantham
Department of Computing Technologies, SRM Institute of Science and Technology, Kattankulathur, Chennai, India
{kuchan2001, adityashukzy}@gmail.com muruganb@srmist.edu.in
Abstract. We present an extensive driver assistance system capable of executing two essential tasks. The first module is used in assisting the driver with road safety alerts (RSA); it scans the environment us- ing a camera attached to the front bonnet of the vehicle and detects any significant entities including but not limited to vehicles, pedestri- ans, and traffic lights. For this module, we also propose the usage of a compute-accelerated Swin Transformer model and evaluate its efficacy against other state-of-the-art models by considering relevant metrics like inference time and mAP. The second module pertains to driver alertness detection (DAD) for identifying signs of fatigue. It scans the driver’s face and monitors a live-video feed to ensure that the driver shows no signs of micro-sleep. When either module detects a behavioural anomaly, it will alert the driver with text-based messages and non-disruptive audio mes- sages. We propose such a state-of-the-art safety system being integrated into the advanced driver assistance systems (ADAS’s) seen in modern vehicles.
Keywords: Road entity tracking · Facial behavior monitoring · Real- time live video analysis · Object Detection · ADAS
1 Introduction
Ensuring the safety and alertness of drivers is a task of utmost importance. A large number of the traffic accidents that take place can be mitigated by the drivers being made alert and aware of the things happening on the road. While car accidents are often due to rule-breaking and over-speeding, more devastating accidents (caused by heavier vehicles such as trucks, trailers and lorries) are often due to overworked and sleep-deprived drivers. A system that can both alert and assist drivers on the road can help mitigate this problem. Hence, we propose a real-time driver assistance system that will equip drivers of four-wheel vehicles with some essential safety measures.
Various road entities such as crossing pedestrians, vehicles in proximity, traf- fic signals, etc. are potential causes of accidents if not paid due attention towards. Such mishaps usually occur as a result of drowsy drivers, or due to a lack of con- scious awareness of the objects moving in and around the driver’s line of sight.

2 P. Ravi & A. Shukla & B. Muruganantham
Fig. 1. Conceptual Diagram for Driver Assistance System
To aid the driver in identifying such obstacles, we present an object detection mechanism with the ability to identify several classes of obstacles and possible hindrances on the road including vehicles, pedestrians, motorcyclists, bikers, as well as larger vehicles such as buses and trucks. Additionally, we compute an estimate of the distance between our vehicle and other detected cars or pedes- trians. If an object comes closer than 15 feet, we propose alerting the driver via a non-disruptive audio message. This is the work of the RSA module. Such measures can help the driver foresee any plausible mishaps and regain control of the vehicle in a timely manner.
Coming to inside the vehicle, the DAD module tracks the driver’s behavior and alerts them in a similar non-disruptive manner if they get drowsy. The foremost signs of drowsiness include yawning and droopy eyes.
While the RSA module scans the external environment, the DAD module keeps an eye on the internals i.e. the driver. These two modules, hence, work in tandem.
2 Related Works
Previous literature on this subject has introduced various algorithms for detect- ing drowsiness, yawns as well as for localizing & classifying driver behavior or distraction. We have also seen various approaches explored for integrating such safety systems into ADAS’s and vehicles in general.
In [4], the authors propose a drowsiness detection system on the Android interface wherein they combine the face detection and drowsiness identification module with a smartphone interface. Once the level of drowsiness is detected, they sound an alarm along with visual alerts. They detect yawns, blinks, head movements and signs of distractions. The authors of [4] also consider some special case scenarios in which the driver wears glasses or has hair covering their face. They calculate the number of correct predictions with respect to ground truth values and achieve an average hit rate of 93.37%.
 
Real-time GPU-accelerated Driver Assistance System 3
The authors of [12] propose and compare two different approaches to detect- ing drowsiness among drivers: the first being to use a combination of recurrent and convolutional neural networks to capture the temporal aspect, and the sec- ond, to use features from images which are sent to a fuzzy classifier for obtaining predictions. As per their claims, the fuzzy system outperforms the other algo- rithm due to the reduced presence of false positives, helping achieve a high specificity of 93%. They use a combination of the GRU [2] and EfficientNet [11] architectures to ensure that a certain number of frames is stored in the model’s memory so that accurate predictions can be obtained for the current frame. Fur- ther, a CNN system with a fuzzy triangular function is also utilized to assess blinks, yawns and sleep. They achieve an average accuracy of 55.4% and 52% for the first and second methods respectively on the testing data.
In the work by [12], OpenCV [1] and the Dlib library [5] are employed to detect instances of drowsiness among drivers. They find the aspect ratio and establish a threshold which, if crossed, will cause an alert to be issued. Once the facial landmarks are localized successfully, their algorithm calculates an aspect ratio and sounds an alarm if the counter increases beyond a certain set limit. They also test the robustness of their proposed method and quantify how it behaves under special circumstances such as bespectacled faces and added noise.
In [6], the authors propose a CenterNet [13] based convolutional network architecture wherein they include modifications to help with optimized down- sampling and spatial pyramid pooling. They obtain key point estimates and use output layers similar to [13] for producing results. The usage of atrous convo- lutions and lower image resolutions helps the authors of [6] to achieve compu- tational optimality. They fulfill the object of road entity detection using their modified ASPP-CenterNet and achieve an AP of 70.6 on small objects.
Companies such as Tesla and Waymo use more sophisticated techniques such as 3D object detection, semantic segmentation and monocular depth estimation for capturing the road environment.
While the aforementioned techniques have one specific focus, our work’s ob- jective is manifold and involves optimized computations, entity detection, dis- tance estimation while also ensuring driver’s alertness. Because some of the pri- mary concerns in vehicular ADAS systems are the memory & energy footprints, we strive to achieve maximum performance with limited resources.
3 Methods
We shall classify our discussions regarding the methods as per the module under which they are proposed to be implemented, namely RSA and DAD.
3.1 Road Safety Alerts (RSA) – Obstacle Localization
Road entity detection and tracking The task of detecting objects has been assigned to the Swin Transformer [8], a state-of-the-art architecture that excels in object detection and classification tasks. By making use of the publicly available

4 P. Ravi & A. Shukla & B. Muruganantham
Udacity Self Driving Dataset (which consists of a slew of images with diverse terrains, weather conditions, and times of day), we propose the usage of the Swin [8] model to precisely detect various road entities. We further compare the performance of Swin with several variants of the popular YOLO object detection model so as to provide an idea of how our proposed detection and distance estimation methods outperform pre-existing standards.
Swin [8] is a transformer architecture that suits various computer vision tasks such as object detection, image classification and object recognition. It is a successor of the Vision Transformer [3] and its core architecture consists of various transformer blocks and their corresponding patch-merging blocks.
The initial patch partition block converts the input image into 4x4 non- overlapping patches and the linear embedding layer converts the channels of the patches to a vector of size C. Further, the image is passed through successive patch merging and Swin Transformer blocks. The patch merging modules help combine the features and amplify the output channels.
Fig. 2. Obstacle Identification
This Swin architecture, trained on the COCO dataset [7], has been employed as the base for performing accurate object detection; an example of its perfor- mance can be seen in Figure 2. The bounding box values predicted by Swin (and passed downstream) provide the basis for estimating the distance of the detected entities (pedestrians, trucks, other vehicles, etc.) from the driver’s vehicle.
In Figure 3, we have provided a visualization of Swin’s learning process and how it localizes bounding box pixel features for further assigning its predictions.
Distance Estimation Once the model has tracked the objects present in a frame, the natural next step is to develop some heuristic to sift through the objects and alert the driver to any significant entities. This may be other vehicles such as cars, trucks, bikers, or even pedestrians crossing the street.
The method used to approximate the distance of an entity from the camera is as follows: say an object is present in an image. Let its real-world width be W units and its apparent width (i.e. that in the image) be P pixels. Let us also say that the object is present at a distance of D units from the camera.
Now, we can compute a value called the perceived focal length F as follows:
 
Real-time GPU-accelerated Driver Assistance System 5
 Fig. 3. Masked Pixel Representation - Swin Transformer
F = WP × D (1)
The above method leverages the property of Triangle Similarity.
This value holds insight on how the size of an object scales to its size in an image. We can intuit that this has something to do with its real size, its apparent size, and how far it is from the camera. Thus, given a scenario where we do not know the 3rd factor, we can work out a guess for how far the object is if we know the first two.
Taking the example of cars, not all models and brands of cars have the same width, but we can come up with a fairly accurate ballpark: most commercial-use cars such as sedans and hatchbacks are about 5.8ft wide. This value can serve as the real width of the object.
Fig. 4. Distance Estimation
Now, given an image where our model has detected and put a bounding box around a car, we can very easily compute the width of that car in pixels in the image. This value can serve as the apparent width of the object.
Finally, using the below formula derived from 1, we can compute the dis- tance D (in ft.), of the object from the camera:
D=WP ×F (2)
where, W - real width of object (in ft.), P - perceived width of object (in pixels), F - perceived focal length (computed separately beforehand).
 
6 P. Ravi & A. Shukla & B. Muruganantham
Note that the value of F will be different for each class that our model detects. This is because each class’s object has a different average length, and hence the value of F is computed manually beforehand for each class.
An example of the above method applied to an image is in Figure 4.
3.2 Driver Alertness Detection (DAD) – Facial Landmark tracking
We employ the use of facial landmarks and mathematical distance norms (such as the Euclidean distance) to track aspects like drowsiness and yawning for a driver. The requirements for the same include a functioning camera fitted inside the vehicle for collecting an input video stream of the driver’s face, and a pipeline to forward this incoming data stream to a processing device that will run our proposed algorithm.
Google’s open-source Mediapipe Face Detection Framework [9] helps localize facial landmark points in the form of a 3D mesh on every human subject ap- pearing before the camera. 468 local points are identified on an individual’s face and are numbered distinctly to aid with facial behaviour tracking.
The drowsiness and yawn detection algorithms work as independent mod- ules on real-time camera input. These algorithms can be scaled across hardware regardless of the configuration and quality of equipment used.
Fig. 5. Ratio of Eyelid distance
Eye Motion Tracking The OpenCV library [1] of Python plots the marked facial points for the upper and lower eyelids of both eyes by making use of the aforementioned Mediapipe Framework [9] for assigning 3-dimensional landmarks throughout the facial surface appearing on the frame. The tracking algorithm then converts the relevant predicted facial points to Numpy arrays by indexing the required landmarks. The arrays corresponding to the eye region are then unraveled to obtain the coordinates for the landmarks pertaining to the eyes.
Having obtained the eye-level data, an Eye Aspect Ratio (EAR) metric is calculated independently for each eye by using the Euclidean Distance norm. The distance is calculated between the extreme coordinates of the eye, namely, the horizontal extremes across the eye and vertical extremes down the middle. This allows us to estimate the linear distance between the desired landmarks.
 
Real-time GPU-accelerated Driver Assistance System 7
Subsequently, we can divide the horizontal distance by the vertical distance to obtain a ratio. In this way, each time the human appearing in the frame blinks, the ratio tends towards infinity indicating that the eyes are closed.
This gives us a final heuristic in the form of a ratio metric that sheds light on whether the eye is closed or not. A set threshold of 30 frames is the criterion for ascertaining whether the driver is indeed micro-sleeping (which usually lasts up to 15 seconds).
Eye Aspect Ratio (EAR) = horizontal eyelid distance (3) vertical eyelid distance
Additionally, we have included an audio alert system which will trigger an alarm upon observing recurring micro-sleep patterns so that the driver can be audibly alerted in case a visual text message is inadequate. The alarm is played at regular intervals upon detecting micro-sleep to ensure that it fulfills the role of keeping the driver attentive.
Hence, our proposed system will help detect early signs of danger, and ac- cordingly provide audio warnings to alert the driver. Whether the driver wishes to stop and rest, or continue the rest of the journey (albeit at their own risk) is up to their own discretion.
Lip Tracking Yawns are a significant and widely noticeable symptom of micro- sleep, and therefore an immediate detection and alert system will help prevent possible accidents as a consequence. The lips are detected using the Dlib facial landmark algorithm [5]. It localizes a number of landmarks across the human face, 68 to be exact. Much like in Mediapipe, we can then access data pertaining to the mouth and lips by indexing the landmark arrays relevant to that region.
Fig. 6. Dlib Facial Landmarks
The detected lips are denoted by points 49 through 68 in Figure 6. Much like for eyes, we need a metric to approximate the distance between the upper and lower lips (which will indicate a yawn). Thus, we loop through the coordinates pertaining to the upper and lower outlines of the upper lip, as well as those for the lower lip. We then find a mean of the values for the upper lip, and similarly
  
8 P. Ravi & A. Shukla & B. Muruganantham
for the lower lip. The final metric is a modulus of the distance between these two means.
Euclidean Distance This metric is used to find the distance between two given points in the coordinate plane. For the x- and y-coordinates of the points, their differences are squared and summed. The square root of this value produces the final distance between the given points as in 4.
q22
d= (x1−x2) +(y1−y2) (4)
 4 Experiments
4.1 Dataset
The pre-trained checkpoints used by us for the Swin and YOLO models have been trained on the COCO dataset. COCO is a large-scale image dataset used for a variety of computer vision tasks such as object detection, image segmentation, and captioning. Although COCO has several classes, the ones pertaining to our interest are car, person, bicycle, motorcycle, bus, truck, and traffic light.
Additionally, we also tested the pre-trained model for inference on the afore- mentioned Udacity Self Driving Dataset. Some of the model predictions on im- ages from the Udacity dataset can be found in Figure 7.
4.2 Transfer Learning
We have made use of the variant of Swin pre-trained on the COCO dataset [7]. The foremost reason for adopting this technique is that COCO [7] already in- cludes all the classes of road entities that we wish to detect, making it ideal for our use case. The other important reason why is that the results established by the pre-trained model only help us further ensure that predictions are accurate. Hence, we mitigate the possibility of putting the driver in jeopardy due to inad- equate real-time performance. Thus, the process of displaying road safety alerts is accelerated by using accurate and speedy predictions, as well as instantaneous delivery of live updates.
4.3 Metrics
F1 Score The F1 score is a highly balanced evaluation metric as it combines both the Precision and Recall values to obtain a single comprehensive score which can be used to evaluate any algorithm’s performance while not compromising on either precision or recall.
Precision is the measure of how many frames have been correctly classified as belonging to the positive class. Recall is the measure of how many actually positive labels have also been predicted as such. F1-score is the harmonic mean of the precision and recall values.

Real-time GPU-accelerated Driver Assistance System 9
F1 Score = 2 × P × R (5) P+R
mean Average Precision (mAP) The area under the precision-recall curve is termed as the average precision and is used as a performance evaluation metric for object detection tasks. This metric compares the ground truth bounding box to that of the model’s predictions and the returned value indicates how closely the two boxes overlap. This is calculated for each output box produced by model for every available class.
AP =X(Rn −Rn−1)×Pn (6) n
The mAP value is simply the mean of all average precision values obtained by summing and dividing by the total number of samples provided to the model.
mAP = 1c × X Average P recisionc (7) c
Here, C represents the number of classes present in the image.
Specifically, we make use of the map@[0.5:0.95] which signifies a range of confidence thresholds from 0.5 (lowest) all the way to 0.95 (highest) for better analysing model performance in different scenarios.
Inference Times One significant metric that we rely on to advocate the usage of our method is the inference time observed when the Swin Transformer [8] is used to obtain predictions for any image supplied.
Inference Time = End Time − Start Time (8) Number of Images
Here, the Start and End times correspond to the time interval for which the inference function runs.
4.4 Results
We experimented with various state-of-the-art model architectures prevalent in object detection including the aforementioned Swin Transformer [8] and other object detection models from the much acclaimed YOLO family. We made use of transfer learning by using the pre-trained weights of each of these models trained on the popular COCO dataset [7], which contains several classes of interest to us, such as cars, bikers, pedestrians, and traffic lights.
The Swin-T model has been employed for inference purposes and relevant re- sults have been demonstrated along with corresponding inference times in Table 1 and other state-of-the-art YOLO object detection models [10] have been used for comparison purposes. Both Non-GPU and GPU-accelerated models have been considered for inference purposes.
  
10 P. Ravi & A. Shukla & B. Muruganantham
Fig. 7. Bounding box Predictions
Fig. 8. Average Inference Time vs. mAP (with and without GPU) for different models
The graph given in Figure 8 depicts the comparison of inference times for GPU and non-GPU model performance. Upon running inference using various object detection models, we observed that the GPU-accelerated Swin Trans- former provides highly optimized inference times and mAP values to account for computational efficiency while maintaining commendable mAP values.
Fig. 9. Results of Drowsiness module
The images displaying necessary alerts for the DAD module and aspect ratios are attached in Figure 9 along with the frames per second used while passing them through our algorithm. They are indicative of the DAD module’s perfor- mance in real-time for any surrounding regardless of the atmospheric setting. When the driver is drowsy and a certain frame threshold is crossed, the alert pops up on screen as can be seen in Figure 9 along with an audio alert. If and when the yawn counter registers a yawn, non-disruptive alerts are also issued.
Furthermore, we plotted the confusion matrix for the DAD module using our algorithm’s predictions for 30 arbitrary frames obtained at regular intervals. Each of these frames was manually labelled to obtain the ground truth values. The confusion matrix shown in Table 2 is the depiction of how well our DAD algorithm performs in real-time.
   
Real-time GPU-accelerated Driver Assistance System 11
Table 1. Inference Details (confidence threshold = 75%) Architecture No. of Parameters (millions) mAP (0.5:0.95) Avg. Inference Time Per Image (seconds)
 Model
Swin Transformer YoloX
Yolov5 YoloR YoloV6
Swin-T YoloX-small YoloX-medium Yolov5-small Yolov5-medium YoloR-E6 YoloR-D6 Yolov6-s Yolov6-n
28 46.0 8.97 40.5 25.3 46.9 7.2 37.4 21.2 45.4 17.1 39.9 21.8 40.4 17.2 43.1 4.3 30.8
With GPU
0.0193 0.018 0.087 0.013 0.064 0.038 0.026 0.042 0.066
Without GPU
– 0.67 1.55 0.38 0.96 0.98 0.94 0.38 0.95
           Table 2. Confusion Matrix - Alertness Detection
 Module Labels
True Positive True Negative
 Micro-sleep Predicted Positive 15 5 Predicted Negative 3 7 Yawns Predicted Positive 17 3 Predicted Negative 2 8
    From the confusion matrix, we have also computed the precision, recall, F1 score and accuracy for the DAD module in Table 3 as this helps us evaluate how robust our algorithm is.
5 Conclusion
The main aim of our work is to strike a necessary balance between memory effi- ciency, inference time, and accuracy. For road entity detection, the Swin Trans- former [8] excels and outperforms all other established state-of-the-art models.
Further, we make use of the 2 modules, RSA and DAD, by effectively pooling results from both. This integration of the two modules further goes to corrobo- rate the final decisions. This is because we are taking into account more points of reference, evaluating safety concerns from different perspectives, and finally obtaining a single-valued score for creating a more comprehensive user interface.
Such a pipeline facilitates a system that is user-friendly and minimally dis- tracting when it comes to driver assistance.
6 Future Works
The method proposed in this paper requires certain scarcely available resources like a GPU, possible IOT integration inside the car and a good quality camera to capture live-stream input data. This ensures the efficient usage of all the features highlighted in the paper. While the distance estimation method used by us is an approximation of the actual distance, it serves the purpose of coming up with a ballpark value so as to be able to judge whether an entity is too close. There are several other methods to perform distance estimation as well, varying in complexity and efficacy.
Technologies like IOT, V2V, and V2I networks have established a stronghold in the research surrounding driver assistance systems. The integration of such systems with the one outlined in this paper may also prove to be an exciting step in establishing a holistic ecosystem.

12 P. Ravi & A. Shukla & B. Muruganantham
Table 3. Results - DAD Module
 Module
Micro-Sleep Tracker Yawn Detection
Precision Recall F1 Score Accuracy
0.833 0.770 0.800 0.733 0.850 0.895 0.872 0.833
   References
1. Bradski, G.: The OpenCV Library. Dr. Dobb’s Journal of Software Tools (2000)
2. Cho, K., van Merrienboer, B., Bahdanau, D., Bengio, Y.: On the properties of neural machine translation: Encoder-decoder approaches. CoRR abs/1409.1259
(2014), http://arxiv.org/abs/1409.1259
3. Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner,
T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. In: In- ternational Conference on Learning Representations (2021), https://openreview. net/forum?id=YicbFdNTTy
4. Galarza, E., Egas, F., Silva, F., Velasco, P., Galarza, E.: Real Time Driver Drowsi- ness Detection Based on Driver’s Face Image Behavior Using a System of Hu- man Computer Interaction Implemented in a Smartphone, pp. 563–572 (01 2018). https://doi.org/10.1007/978-3-319-73450-7_53
5. King, D.E.: Dlib-ml: A machine learning toolkit. Journal of Machine Learning Research 10, 1755–1758 (2009)
6. Li, G., Xie, H., Yan, W., Chang, Y., Qu, X.: Detection of road objects with small appearance in images for autonomous driving in various traffic situations using a deep learning based approach. IEEE Access 8, 211164–211172 (2020). https: //doi.org/10.1109/ACCESS.2020.3036620
7. Lin, T., Maire, M., Belongie, S.J., Bourdev, L.D., Girshick, R.B., Hays, J., Perona, P., Ramanan, D., Dollár, P., Zitnick, C.L.: Microsoft COCO: common objects in context. CoRR abs/1405.0312 (2014), http://arxiv.org/abs/1405.0312
8. Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. International Conference on Computer Vision (ICCV) (2021)
9. Lugaresi, C., Tang, J., Nash, H., McClanahan, C., Uboweja, E., Hays, M., Zhang, F., Chang, C.L., Yong, M., Lee, J., Chang, W.T., Hua, W., Georg, M., Grund- mann, M.: Mediapipe: A framework for perceiving and processing reality. In: Third Workshop on Computer Vision for AR/VR at IEEE Computer Vision and Pat- tern Recognition (CVPR) 2019 (2019), https://mixedreality.cs.cornell.edu/ s/NewTitle_May1_MediaPipe_CVPR_CV4ARVR_Workshop_2019.pdf
10. Redmon,J.,Divvala,S.K.,Girshick,R.B.,Farhadi,A.:Youonlylookonce:Unified, real-time object detection. CoRR abs/1506.02640 (2015), http://arxiv.org/ abs/1506.02640
11. Tan, M., Le, Q.V.: Efficientnet: Rethinking model scaling for convolutional neural networks. CoRR abs/1905.11946 (2019), http://arxiv.org/abs/1905.11946
12. Zaki, A., Mohammed, E., Aaref, A.: Real-time driver awareness detection system. IOP Conference Series: Materials Science and Engineering 745, 012053 (03 2020). https://doi.org/10.1088/1757- 899X/745/1/012053
13. Zhou, X., Wang, D., Krähenbühl, P.: Objects as points. CoRR abs/1904.07850 (2019), http://arxiv.org/abs/1904.07850
"""

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the model identified by `model_name`.
summarizer = pipeline(task="summarization", model=model_name)

# Summarize only the first 1000 characters of TEXT, with sampling disabled and the
# summary length bounded between 100 and 256.
excerpt = TEXT[:1000]
outputs = summarizer(excerpt, min_length=100, max_length=256, do_sample=False)
summary = outputs[0]['summary_text']

print(summary)