I also tried predicting the video elements with OpenCV, since it requires less computational power and still performs well. However, I ultimately preferred a 2D CNN because its accuracy was approximately 10% higher.

In [1]:
import cv2
import os
import pandas as pd

def load_video(video_path, max_frames=20):
    """
    Read up to ``max_frames`` frames from the start of a video file.

    Args:
        video_path: Path of the video, passed straight to ``cv2.VideoCapture``.
        max_frames: Maximum number of leading frames to read. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        list: The frames in the order they were read. The list is shorter
        than ``max_frames`` when the stream ends early, and empty when the
        capture could not be opened or no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        # A capture that failed to open cannot yield frames; skip the loop.
        if cap.isOpened():
            for _ in range(max_frames):
                ret, frame = cap.read()
                if not ret:
                    # End of stream or a failed read: keep what we have so far.
                    break
                frames.append(frame)
    finally:
        # Release the capture even if read() raises, so the handle never leaks.
        cap.release()
    return frames

In [2]:
import numpy as np

def preprocess_frame(frame, threshold=40):
    """
    Turn a BGR video frame into a binary (0 / 255) image.

    The frame is converted to grayscale and then thresholded with
    ``cv2.THRESH_BINARY``: pixels brighter than ``threshold`` become 255 and
    all others become 0. No blurring or other noise removal is applied here.

    Args:
        frame: Frame in BGR channel order, as expected by
            ``cv2.COLOR_BGR2GRAY``.
        threshold: Grayscale cut-off. Defaults to 40, the value that used to
            be hard-coded.
            NOTE(review): presumably tuned for a bright object on a dark
            background -- confirm against the data.

    Returns:
        The binary image returned by ``cv2.threshold``.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns (threshold_used, image); only the image is needed.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    return binary

def find_contours(processed_frame, min_area=50):
    """
    Find the external contours of a binary frame and drop the small ones.

    Nothing is drawn; the function only returns the contours.

    Args:
        processed_frame: Binary image to search (e.g. a thresholded frame).
        min_area: Contours whose ``cv2.contourArea`` is not strictly greater
            than this value are discarded. Defaults to 50, the value that
            used to be hard-coded.

    Returns:
        list: The contours with area > ``min_area``, in the order OpenCV
        returned them. Empty when nothing qualifies.
    """
    found = cv2.findContours(processed_frame, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # OpenCV 4.x returns (contours, hierarchy) while 3.x returns
    # (image, contours, hierarchy); the contours are second-to-last in both,
    # so index from the end instead of unpacking a fixed number of values.
    contours = found[-2]

    return [cnt for cnt in contours if cv2.contourArea(cnt) > min_area]

# Number of polygon vertices -> element label. Any other count maps to "Erde".
_SHAPE_BY_VERTEX_COUNT = {3: "Boden", 4: "Feuer", 5: "Wasser", 6: "Luft"}


def predict_shape(video_path, frame_index=10, epsilon_ratio=0.03, verbose=True):
    """
    Predict the element label of a video from the outline found in one frame.

    One frame is thresholded, its contours are extracted, and the largest
    contour is approximated by a polygon. The polygon's vertex count selects
    the label: 3 -> "Boden", 4 -> "Feuer", 5 -> "Wasser", 6 -> "Luft", and
    anything else -> "Erde".

    Args:
        video_path: Path of the video to classify.
        frame_index: Non-negative index of the frame to analyse (default 10).
            If fewer frames were loaded, the last loaded frame is used.
        epsilon_ratio: Polygon-approximation tolerance as a fraction of the
            contour perimeter (default 0.03).
        verbose: When True (default), print the number of contours found and
            the predicted label, as the function always did.

    Returns:
        str: The predicted label, or the string "None" when no frame could be
        read or no contour passed the area filter.
    """
    frames = load_video(video_path)
    contours = []
    shape = "None"

    if frames:
        # Clamp the index so short clips fall back to their last frame
        # instead of raising IndexError.
        frame = frames[min(frame_index, len(frames) - 1)]
        contours = find_contours(preprocess_frame(frame))

    if contours:
        # Classify the largest contour. The earlier loop overwrote the label
        # on every iteration, so whichever contour came last decided it.
        main_contour = max(contours, key=cv2.contourArea)
        epsilon = epsilon_ratio * cv2.arcLength(main_contour, True)
        approx = cv2.approxPolyDP(main_contour, epsilon, True)
        shape = _SHAPE_BY_VERTEX_COUNT.get(len(approx), "Erde")

    if verbose:
        print(len(contours))
        print(shape)
    return shape

In [6]:
# Smoke test on a single test video; the label returned by predict_shape is
# displayed as the cell output.
video_path = 'BH25/Testing_Data/3.mp4'
predict_shape(video_path)

1
Wasser


'Wasser'

In [7]:
# Classify every .mp4 in the test directory, visiting the files in ascending
# numeric order of the part of the name before the first dot.
test_dir = 'BH25/Testing_Data'
video_files = [name for name in os.listdir(test_dir) if name.endswith(".mp4")]
video_files.sort(key=lambda name: int(name.split('.')[0]))

video_ids = []
elements = []
for file_name in video_files:
    video_path = os.path.join(test_dir, file_name)
    # Keep the id (file name up to the first dot) aligned with its prediction.
    video_ids.append(file_name.split('.')[0])
    elements.append(predict_shape(video_path))

1
Boden
1
Boden
1
Wasser
1
Feuer
1
Luft
1
Feuer
1
Boden
1
Luft
1
Feuer
1
Boden
1
Boden
1
Boden
1
Boden
1
Wasser
1
Erde
1
Erde
1
Luft
1
Feuer
1
Wasser
1
Feuer
1
Luft
1
Boden
1
Erde
1
Erde
1
Feuer
1
Luft
1
Luft
1
Luft
1
Erde
1
Luft
1
Feuer
1
Erde
1
Wasser
1
Feuer
1
Wasser
1
Luft
1
Erde
1
Erde
1
Erde
1
Erde
1
Feuer
1
Wasser
1
Luft
1
Boden
1
Erde
1
Luft
1
Erde
1
Luft
1
Wasser
1
Wasser
1
Boden
1
Boden
1
Boden
1
Boden
1
Wasser
1
Feuer
1
Luft
1
Feuer
1
Erde
1
Boden
1
Boden
1
Luft
1
Luft
1
Erde
1
Erde
1
Wasser
1
Luft
1
Feuer
1
Boden
1
Boden
1
Erde
1
Erde
1
Boden
1
Luft
1
Feuer
1
Erde
1
Boden
1
Erde
1
Luft
1
Luft
1
Feuer
1
Feuer
1
Erde
1
Wasser
1
Boden
1
Wasser
1
Erde
1
Boden
1
Feuer
1
Erde
1
Wasser
1
Luft
1
Boden
1
Erde
1
Luft
1
Boden
1
Feuer
1
Wasser
1
Erde
1
Boden
1
Luft
1
Feuer
1
Luft
1
Boden
1
Erde
1
Erde
1
Luft
1
Boden
1
Luft
1
Erde
1
Feuer
1
Boden
1
Boden
1
Luft
1
Wasser
1
Boden
1
Boden
1
Feuer
1
Wasser
1
Boden
1
Feuer
1
Luft
1
Erde
1
Boden
1
Boden
1
Luft
1
Luft
1
Feuer
1
Wasser
1
Feuer


In [8]:
# Collect the results into a two-column table: the ids gathered in `video_ids`
# become 'video_id' and the predicted labels in `elements` become 'element'.
df = pd.DataFrame({'video_id': video_ids, 'element': elements})

In [9]:
# Preview the first rows of the predictions table.
df.head()

Unnamed: 0,video_id,element
0,1,Boden
1,2,Boden
2,3,Wasser
3,4,Feuer
4,5,Luft


This approach is based on the following reference video:

Video link: [Link](https://youtu.be/Wl11eloYVm8?si=_xVghF5-8Kt1FXUc)