In [30]:
import os
import numpy as np
import librosa
import tensorflow as tf
import tensorflow_hub as hub
import threading
import cv2
from ultralytics import YOLO
from datetime import datetime

# === Model Loading ===
# Both models are loaded once, at import time, so importing this module needs
# the two weight files below to be reachable from the working directory.
yolo_model_path = 'yolo_v1 (2).pt'  # YOLO weights file used for video detection

# Keras model that predict_audio() runs on the 1024-wide audio chunks.
audio_model = tf.keras.models.load_model("best_model.keras")
# Ultralytics YOLO detector that process_video_stream() runs on each frame.
yolo_model = YOLO(yolo_model_path)

# Output directories. process_video_stream() writes its recordings into
# "anomalous_videos"; nothing in the code visible here writes into
# "anomalous_audio".
os.makedirs("anomalous_videos", exist_ok=True)
os.makedirs("anomalous_audio", exist_ok=True)

# Class index -> human-readable name for the audio classifier's output.
# predict_audio() looks up the winning index here, so the position of each
# name in the tuple must match the corresponding output unit of the trained
# model. Append further names at the end if the model has more classes.
index_to_label = dict(enumerate((
    "Emergency_alert_sound",
    "Explosions",
    "Gunshots",
    "Human screams",
    "bottles breaking",
    "dog bark",
)))

def load_audio(file_path, target_sr=16000):
    """Read ``file_path`` with librosa as a mono waveform resampled to ``target_sr`` Hz.

    Only the sample array is returned; the sample rate that librosa reports
    alongside it is discarded.
    """
    waveform, _sample_rate = librosa.load(file_path, mono=True, sr=target_sr)
    return waveform

def preprocess_audio(audio_waveform, target_length=16000 * 5, chunk_size=1024):
    """Fix the clip length, peak-normalise it and split it into equal chunks.

    Args:
        audio_waveform: 1-D sequence of audio samples.
        target_length: Number of samples to keep. Shorter input is zero-padded
            at the end, longer input is truncated. The default is 80,000
            samples (5 seconds at 16 kHz).
        chunk_size: Number of samples per output row.

    Returns:
        A float32 array of shape ``(target_length // chunk_size, chunk_size)``.
        Samples that do not fill a whole chunk are dropped, so with the
        defaults the result is ``(78, 1024)`` and covers 79,872 samples.
    """
    waveform = np.asarray(audio_waveform, dtype=np.float32)

    if len(waveform) < target_length:
        waveform = np.pad(waveform, (0, target_length - len(waveform)))
    else:
        waveform = waveform[:target_length]

    # Scale into [-1, 1]. A silent (all-zero) or empty clip has no peak to
    # scale by; dividing by zero there would fill the array with NaN, so such
    # clips are passed through unchanged.
    peak = np.max(np.abs(waveform)) if waveform.size else 0.0
    if peak > 0:
        waveform = waveform / peak

    # Keep only whole chunks so the reshape is always valid.
    num_chunks = target_length // chunk_size
    return waveform[:num_chunks * chunk_size].reshape(-1, chunk_size)


_YAMNET_MODEL_URL = "https://tfhub.dev/google/yamnet/1"
_yamnet_model = None


def _load_yamnet():
    """Return the YAMNet model, fetching it from TF Hub on first use only."""
    global _yamnet_model
    if _yamnet_model is None:
        _yamnet_model = hub.load(_YAMNET_MODEL_URL)
    return _yamnet_model


def extract_audio_features(audio_waveform):
    """Use YAMNet to turn a waveform into one averaged embedding vector.

    The previous body called ``audio_model`` (the Keras classifier) and
    unpacked three outputs from it; the three-output signature belongs to
    YAMNet, so YAMNet is what is called here.

    Args:
        audio_waveform: Audio samples, expected to be 16 kHz mono in
            [-1, 1] as produced by load_audio(). Any array shape is accepted
            and flattened, because YAMNet takes a 1-D waveform.

    Returns:
        A NumPy array of shape ``(1, 1024)``: the mean of YAMNet's per-frame
        1024-dimensional embeddings.
    """
    # YAMNet model expects a 1D audio waveform, not a 2D input.
    waveform = np.asarray(audio_waveform, dtype=np.float32).reshape(-1)

    _scores, embeddings, _spectrogram = _load_yamnet()(waveform)

    # Average over the frame axis to get a single clip-level feature vector.
    return np.mean(np.asarray(embeddings), axis=0).reshape(1, -1)  # Shape (1, 1024)

def predict_audio(audio_file, confidence_threshold=0.7):
    """Classify an audio file by majority vote over its fixed-size chunks.

    The file is loaded with load_audio(), split into rows of 1024 samples by
    preprocess_audio(), and every row is scored by ``audio_model``.

    Args:
        audio_file: Path of the audio file to classify.
        confidence_threshold: Minimum overall confidence required to report a
            class.

    Returns:
        ``(label, confidence)``. ``label`` is the class most chunks voted
        for, or ``"Unknown"`` when the confidence is below the threshold or
        is not a finite number. ``confidence`` is the mean of every chunk's
        top score, whichever class that chunk voted for, so it is not
        specific to the returned label.

    Raises:
        KeyError: If the winning class index is missing from
            ``index_to_label``.
    """
    # NOTE(review): rows here are raw waveform samples. The comments in
    # extract_audio_features() suggest the classifier may instead expect
    # 1024-d YAMNet embeddings -- confirm against how best_model.keras was
    # trained.
    audio = load_audio(audio_file)
    chunks = preprocess_audio(audio)  # shape (num_chunks, 1024)

    predictions = audio_model.predict(chunks)  # one score vector per chunk
    chunk_classes = np.argmax(predictions, axis=1)
    chunk_confidences = np.max(predictions, axis=1)

    # Majority vote across chunks.
    overall_class = index_to_label[np.argmax(np.bincount(chunk_classes))]
    print("Predicted class :", overall_class)
    overall_confidence = np.mean(chunk_confidences)

    # A NaN confidence compares False against the threshold, which used to
    # let it through as a confident prediction; treat it as "Unknown".
    if not np.isfinite(overall_confidence) or overall_confidence < confidence_threshold:
        return "Unknown", overall_confidence

    return overall_class, overall_confidence


# === Video Threat Detection ===
def process_video_stream(video_source=0, display=False, max_read_failures=30):
    """Run YOLO on a video stream and record the frames that show an anomaly.

    Every frame is passed to ``yolo_model``. While a frame contains a
    detection labelled "violence" or "weaponized", the raw frame is appended
    to an MP4 file under ``anomalous_videos/``; the file is closed as soon as
    a frame has no such detection.

    Args:
        video_source: Anything ``cv2.VideoCapture`` accepts (0 for webcam).
        display: When True, show the annotated frame in a window. OpenCV only
            delivers key presses to ``waitKey`` while a HighGUI window
            exists, so the 'q' shortcut is only expected to work with this
            enabled.
        max_read_failures: Stop after this many consecutive failed reads.
            Without a limit the loop spins forever once a video file ends or
            the camera goes away.

    Returns:
        None.
    """
    cap = cv2.VideoCapture(video_source)  # Use 0 for webcam
    if not cap.isOpened():
        print("Error: Could not open video stream")
        cap.release()
        return

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Some sources report 0 FPS; fall back to 30 so the writer gets a usable rate.
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30

    anomaly_labels = ("violence", "weaponized")
    out = None  # active VideoWriter, or None while not recording
    failed_reads = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                failed_reads += 1
                if failed_reads >= max_read_failures:
                    print("Error: Video stream ended or is unavailable. Stopping.")
                    break
                print("Warning: Failed to capture frame, skipping...")
                continue
            failed_reads = 0

            detections = yolo_model.predict(frame, conf=0.5, verbose=False)[0]

            # Log detections in order and stop at the first anomalous one.
            anomaly_label = None
            if detections.boxes:
                for box in detections.boxes:
                    label = yolo_model.names[int(box.cls)]
                    confidence = box.conf.item()
                    print(f"[{datetime.now()}] Detected: {label} (Confidence: {confidence:.2f})")
                    if label in anomaly_labels:
                        anomaly_label = label
                        break

            if anomaly_label is not None:
                if out is None:
                    video_filename = f"anomalous_videos/{anomaly_label}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
                    out = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))
                    print(f"[{datetime.now()}] Recording started: {video_filename}")
                out.write(frame)
            elif out is not None:
                print(f"[{datetime.now()}] Anomaly ended. Stopping recording.")
                out.release()
                out = None

            if display:
                cv2.imshow("Threat Detection", detections.plot())

            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Exit requested. Stopping program.")
                break
    finally:
        # Always finalise the recording and free the capture device, even if
        # detection raised part-way through.
        if out is not None:
            out.release()
        cap.release()
        cv2.destroyAllWindows()

# === Unified System ===
def unified_system(audio_file=None, video_source=0):
    """Run audio threat detection on a worker thread and wait for it to finish.

    When ``audio_file`` is given, predict_audio() is run on it in a separate
    thread and the outcome is printed. A thread object for
    process_video_stream() is built from ``video_source`` but is never
    started in this function, so no video processing takes place.
    """
    # Prepared for the video detector; the calls that would start it are disabled.
    video_thread = threading.Thread(target=process_video_stream, args=(video_source,))

    if not audio_file:
        return

    def report_audio_prediction():
        label, score = predict_audio(audio_file, confidence_threshold=0.7)
        if label != "Unknown":
            print(f"Predicted class: {label}")
            print(f"Confidence: {score:.2f}")
            return
        print("The audio file does not belong to any trained class.")

    worker = threading.Thread(target=report_audio_prediction)
    worker.start()
    worker.join()

# Entry point
# Runs only when the file is executed as a script. As unified_system() is
# written, only the audio prediction runs: video_source is passed through,
# but the video thread it is used for is never started.
if __name__ == "__main__":
    audio_file_path = "R01-53-Medium-Sized Dog Barking.wav.mp3"  # Replace with your audio file path
    unified_system(audio_file=audio_file_path, video_source=0)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step 
Predicted class : Explosions
The audio file does not belong to any trained class.
