# JieShen Testing
Original -> ROIs -> Training

We need these steps:
1. Human Detection/Segmentation, separation of ROIs from frames (With YOLO/YOLO-obb)
2. Human Tracking (With SORT)
3. Drowning Detection (With our own model e.g. ViT, CKAN, ResNet..... all CNN-based)
4. Compute Running Threshold (\hat{y}_t = \alpha y_t + (1 - \alpha) \hat{y}_{t-1}, where y_t is the raw prediction at frame t and \hat{y}_t is the smoothed value)
5. Draw results

But most of these steps do not involve any classical image processing, so maybe we need to include something like
1. Histogram Equalisation (To remove the blue background, but there are also pretrained models that can do this)
2. Posture Estimation (But not sure how it helps)

Do notice that the ground truth given is in [Prediction_Outcome, Bounding Box Coordinates]

YeeJing
- Say YOLO not good, using nano, suggesting to train from scratch [Suggest to fine-tune by loading the pretrained YOLOs]
- In terms of drowning classifier, suggesting to train YOLO-like from scratch [ViT, CKAN, ResNet, YOLO-like...]
- How to classify in actual stream deployment
- Say not much processing can be done on images, examples like BGR2HSV, resizing, histogram equalisation (focusing on removing/weakening blue channel) [Image Jittering, Rotation]

# TODO:
1. fine-tune YOLO for human detection
2. Try out different preprocessing techniques
3. Train different models
4. Evaluation of models
5. DEPLOYMENT

Predicted
t_0 = 0
t_1 = 0.5
t_2 = 0.9
t_3 = 1
t_4 = 0.3

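Display (With alpha = 0.5; in layman's terms, alpha is the sensitivity to the newest prediction)
The equation: \hat{y}_t = \alpha y_t + (1 - \alpha) \hat{y}_{t-1}, where y_t is the raw prediction and \hat{y}_t is the displayed (smoothed) value

t_0 = 0
t_1 = 0.5 * 0.5 + (1 - 0.5) * 0 = 0.25
t_2 = 0.5 * 0.9 + (1 - 0.5) * 0.25 = 0.575 (~0.6)
t_3 = 0.5 * 1 + (1 - 0.5) * 0.575 = 0.7875 (~0.8)
t_4 = 0.5 * 0.3 + (1 - 0.5) * 0.7875 = 0.54375 (~0.5)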

The smoothed t value is also used to determine the color of the bounding box. [0 - 1]->[Green - Red]
Example: If t > 0.7, then emit sound


CNN-Based Model:
Conv2d
Conv2d
Pooling
Flatten()
Softmax/Sigmoid

-> Probability













# Sample by Copilot down here

Below are instructions along with code cells you might add to your notebook. Create new code cells in your current ipynb and paste each code block:

Cell 1 – Import libraries:


Sure, here's a proposed directory structure for a Python Deep Learning project:



Create Workspace...

In [None]:
import cv2
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense



Cell 2 – Define and compile your CNN:


In [None]:
def build_cnn_model(input_shape):
    """Build and compile a small binary-classification CNN.

    Two Conv2D + MaxPooling2D stages feed a 128-unit dense layer and a
    single sigmoid output (drowning = 1, normal = 0).

    Args:
        input_shape: Shape passed to the first Conv2D layer as its
            ``input_shape`` argument.

    Returns:
        The Sequential model, compiled with the Adam optimizer,
        binary cross-entropy loss and an accuracy metric.
    """
    # Feature extractor: two convolution/pooling stages.
    feature_layers = [
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
    ]
    # Classifier head ending in one sigmoid unit for the binary decision.
    head_layers = [
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]

    net = Sequential(feature_layers + head_layers)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return net

# Initialize your model once at module level; process_video() reads this
# global `model`. No weights are loaded or trained here.
input_shape = (64, 64, 1)  # For grayscale images of size 64x64
model = build_cnn_model(input_shape)
print("CNN Model built successfully!")



Cell 3 – Process video and run detection:


In [None]:
def process_video(source=0):
    """Classify every frame of a video stream and display the result.

    Each frame is passed through a MOG2 background subtractor; the resulting
    foreground mask is resized to 64x64, scaled to [0, 1] and fed to the
    module-level ``model``. The predicted label is drawn on the frame, and
    both the frame and the mask are shown until the stream ends or 'q' is
    pressed.

    Args:
        source: Value handed to ``cv2.VideoCapture``. Defaults to 0 (the
            webcam), which is what the original hard-coded.
    """
    # Create a background subtractor
    backSub = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=25, detectShadows=True)
    cap = cv2.VideoCapture(source)

    # try/finally so the camera and the windows are released even when the
    # loop is interrupted (e.g. stopping the notebook cell) or predict() raises.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Frame capture failed, exiting...")
                break

            # Apply background subtraction to extract foreground
            fgMask = backSub.apply(frame)

            # Resize foreground mask to the required input size for the CNN
            resized = cv2.resize(fgMask, (64, 64))

            # Preprocess: normalize, then add channel and batch axes -> (1, 64, 64, 1)
            img_input = resized.astype('float32') / 255.0
            img_input = img_input[np.newaxis, ..., np.newaxis]

            # Use the CNN for prediction (requires proper training for accuracy).
            # verbose=0 keeps Keras from printing a progress bar for every frame.
            prediction = model.predict(img_input, verbose=0)
            label = "Drowning" if prediction[0][0] > 0.5 else "Normal"

            # Display label on the original frame
            cv2.putText(frame, label, (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow('Live Video', frame)
            cv2.imshow('Foreground Mask', fgMask)

            if cv2.waitKey(30) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Run the detection process. This call blocks: process_video() loops until a
# frame read fails or 'q' is pressed.
process_video()



Simply add these cells in the order shown, and run them in your notebook (ipynb). Adjust parameters and paths as needed.

In [None]:
# YOLO for Human Detection
import cv2
import numpy as np

# Paths to your YOLO configuration, weights, and COCO class names.
# These are relative paths, so they are resolved against the current working directory.
config_path = 'yolov3.cfg'
weights_path = 'yolov3.weights'
names_path = 'coco.names'

# Load COCO class names: one name per line. The list is indexed by the class ID
# chosen in get_yolo_detections(), so line order matters.
with open(names_path, 'r') as f:
    classes = f.read().strip().split('\n')

# Load YOLO using OpenCV's DNN module (Darknet .cfg + .weights pair)
net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
# Request OpenCV's own backend with the CPU as the inference target
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

def get_yolo_detections(frame):
    """Detect people in ``frame`` with the module-level YOLO ``net``.

    Detections whose top-scoring class is "person" with a score above 0.5
    are kept, then filtered with non-maximum suppression.

    Args:
        frame: Image array; its first two dimensions are read as
            (height, width) and used to scale the network's box outputs.

    Returns:
        A list of ``(x, y, width, height)`` integer tuples, where (x, y) is
        the top-left corner in pixels. Empty if nobody was detected.
    """
    # Create a blob from the input frame
    blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Ask the net for its output layer names directly. Indexing
    # getUnconnectedOutLayers() as ``i[0]`` breaks on OpenCV builds that
    # return a flat array instead of an Nx1 array.
    layer_outputs = net.forward(net.getUnconnectedOutLayersNames())
    boxes, confidences = [], []

    H, W = frame.shape[:2]
    scale = np.array([W, H, W, H])  # loop-invariant, so built once
    for output in layer_outputs:
        for detection in output:
            # Class scores start at index 5; indices 0-3 are the box.
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]
            if classes[classID] == "person" and confidence > 0.5:
                # Box comes as (centre x, centre y, width, height), scaled to pixels here.
                box = detection[0:4] * scale
                centerX, centerY, width, height = box.astype("int")
                x = int(centerX - width / 2)
                y = int(centerY - height / 2)
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))

    idxs = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.3)
    if len(idxs) == 0:
        return []
    # np.array(...).flatten() accepts both the Nx1 and the flat index layouts.
    return [tuple(boxes[i]) for i in np.array(idxs).flatten()]

In [None]:
# Tracking (uses OpenCV's CSRT single-object tracker; SORT itself is not implemented in this cell)
# Initialize a CSRT tracker for single object tracking
def initialize_tracker(frame, bbox):
    """Create a CSRT tracker seeded with ``bbox`` on ``frame``.

    Args:
        frame: Image passed to the tracker's ``init`` call.
        bbox: Bounding box passed to the tracker's ``init`` call.

    Returns:
        The initialised tracker object.
    """
    csrt = cv2.TrackerCSRT_create()  # CSRT is effective for SOT tasks
    csrt.init(frame, bbox)
    return csrt

# Grab one frame from the capture (webcam or file) and use it to seed the tracker.
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
if ret:
    detections = get_yolo_detections(frame)
    if not detections:
        print("No person detected to initialize tracker.")
    else:
        # Only the first detected person is tracked.
        bbox = tuple(detections[0])
        tracker = initialize_tracker(frame, bbox)
        print("Tracker initialized with bounding box:", bbox)
else:
    print("Failed to acquire a frame.")

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Build a simple CNN model for drowning classification.
def build_drowning_classifier(input_shape=(64, 64, 1)):
    """Build and compile a simple CNN for drowning classification.

    Args:
        input_shape: Shape given to the first Conv2D layer; defaults to
            ``(64, 64, 1)``.

    Returns:
        A Sequential model with one sigmoid output, compiled with Adam,
        binary cross-entropy loss and an accuracy metric.
    """
    classifier = Sequential()

    # Two convolution + pooling stages.
    classifier.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    classifier.add(MaxPooling2D((2, 2)))
    classifier.add(Conv2D(64, (3, 3), activation='relu'))
    classifier.add(MaxPooling2D((2, 2)))

    # Dense head with a single sigmoid output.
    classifier.add(Flatten())
    classifier.add(Dense(128, activation='relu'))
    classifier.add(Dense(1, activation='sigmoid'))

    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Instantiate the classifier with its default (64, 64, 1) input shape.
# No weights are loaded or trained here.
drowning_model = build_drowning_classifier()
print("Drowning classifier model built successfully.")

# Tracking loop: update tracker, predict drowning status, and display results.
#
# `tracker` is only created when the initialisation cell found a person in the
# first frame. Look it up defensively so a missed detection shows "Tracking
# failure" instead of crashing with NameError.
active_tracker = globals().get("tracker")

# try/finally so the capture and windows are released even if the loop is
# interrupted (e.g. stopping the notebook cell) or a call inside it raises.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Update the tracker (treated as a failure when none was initialised)
        if active_tracker is not None:
            ok, bbox = active_tracker.update(frame)
        else:
            ok, bbox = False, None

        if ok:
            x, y, w_box, h_box = map(int, bbox)
            cv2.rectangle(frame, (x, y), (x + w_box, y + h_box), (255, 0, 0), 2)

            # Extract region of interest (ROI) for classification.
            # Clamp the box to the frame first: a negative start index would
            # make NumPy slice from the opposite edge and return the wrong pixels.
            frame_h, frame_w = frame.shape[:2]
            x0, y0 = max(x, 0), max(y, 0)
            x1 = max(min(x + w_box, frame_w), x0)
            y1 = max(min(y + h_box, frame_h), y0)
            roi = frame[y0:y1, x0:x1]
            if roi.size != 0:
                roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
                roi_resized = cv2.resize(roi_gray, (64, 64))
                roi_norm = roi_resized.astype('float32') / 255.0
                # Add channel and batch axes -> (1, 64, 64, 1)
                roi_input = roi_norm[np.newaxis, ..., np.newaxis]

                # Predict drowning status (this is untrained; train with your data).
                # verbose=0 avoids a Keras progress bar on every frame.
                prediction = drowning_model.predict(roi_input, verbose=0)
                label = "Drowning" if prediction[0][0] > 0.5 else "Normal"
                cv2.putText(frame, label, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        else:
            cv2.putText(frame, "Tracking failure", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)

        cv2.imshow("Tracking", frame)
        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()