In [20]:
import os
import cv2
from ultralytics import YOLO
import findspark
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col, substring, count, when
import findspark
from pyspark.sql.functions import col, when, max as spark_max, regexp_extract
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [71]:
def logger(model_path, root_dir, images_folder, output_csv):

    model = YOLO(model_path)
    
    data = []

    for images in images_folder:
        
        date = images[6:16]
        shoot = images[17:25]
        
        frame = int(images[26:-4])
        
        image_path = os.path.join(root_dir, images)
        
        predicts = model.predict(image_path)


        for pred in predicts:
            bboxes = pred.boxes

            bbox = bboxes.xyxy.tolist() 
            if len(bboxes.cls) != 0:
                    
                data.append((images, date,shoot,frame,  bbox, 'miss'))
            else:
                data.append((images, date,shoot,frame,  bbox, 'broken'))



    df = pd.DataFrame(data, columns=['images', 'date','shoot','frame', "bbox_loc", "miss_or_broken"])
        
    df.to_csv(output_csv, index=False)
    

In [72]:
model_path = r'C:\Users\yaros\OneDrive\Рабочий стол\MLBD LAB3\best.pt'
root_path = r'C:\Users\yaros\OneDrive\Рабочий стол\MLBD LAB3\extracted1'
images_folder = os.listdir(root_path)
output_csv = 'finally.csv'

logger(model_path, root_path, images_folder, output_csv)


image 1/1 C:\Users\yaros\OneDrive\ \MLBD LAB3\extracted1\frame_02-01-2024_SHOT0001_0.jpg: 384x640 1 miss, 775.0ms
Speed: 4.0ms preprocess, 775.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 C:\Users\yaros\OneDrive\ \MLBD LAB3\extracted1\frame_02-01-2024_SHOT0001_141.jpg: 384x640 1 miss, 764.0ms
Speed: 2.0ms preprocess, 764.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 C:\Users\yaros\OneDrive\ \MLBD LAB3\extracted1\frame_02-01-2024_SHOT0001_47.jpg: 384x640 1 miss, 764.0ms
Speed: 2.0ms preprocess, 764.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 C:\Users\yaros\OneDrive\ \MLBD LAB3\extracted1\frame_02-01-2024_SHOT0001_94.jpg: 384x640 1 miss, 763.0ms
Speed: 2.0ms preprocess, 763.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 C:\Users\yaros\OneDrive\ \MLBD LAB3\extracted1\frame_02-01-2024_SHOT0002_0.jpg: 384x640 (no detections), 744.0ms
Speed: 4.0ms preproc

In [85]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lag
from pyspark.sql.window import Window



spark = SparkSession.builder.appName("LongestSeries").getOrCreate()

df = spark.read.csv(output_csv, header=True)


df = df.withColumn("bbox_loc", when(col("bbox_loc").isNull(), None).otherwise(F.regexp_replace(col("bbox_loc"), "[\[\]]", "")))
df = df.withColumn("bbox_loc", F.split(col("bbox_loc"), ", ").cast("array<float>"))


w = Window.partitionBy("date").orderBy("frame")
df = df.withColumn("prev_miss_or_broken", lag("miss_or_broken").over(w))


df = df.withColumn("series", when((col("miss_or_broken") == "miss") & (col("prev_miss_or_broken") != "miss"), 1).otherwise(0))
df = df.withColumn("series", F.sum("series").over(w))


df = df.filter(col("miss_or_broken") != "miss")

result = df.groupBy("date").agg(F.max("series").alias("max_series_length"))


result.show()




+----------+-----------------+
|      date|max_series_length|
+----------+-----------------+
|02-01-2024|               53|
|02-07-2024|               59|
|02-18-2024|               74|
|02-21-2024|               56|
|03-01-2024|               49|
|03-07-2024|               28|
|03-14-2024|                6|
|03-24-2024|               61|
|03-28-2024|               68|
+----------+-----------------+

