In [None]:
import os
import numpy as np
import librosa
import sounddevice as sd
import tensorflow as tf
import tensorflow_hub as hub
import threading
import cv2
from scipy.io.wavfile import write
from scipy.signal import butter, lfilter
from ultralytics import YOLO
from datetime import datetime

# === Model Loading ===
# All paths below are relative to the current working directory of the kernel.
yolo_model_path = 'yolo_v1 (2).pt'  # YOLO weights used by process_video_stream

# Load models (done once at import time; every function below reads these globals)
# audio_model: Keras classifier that is fed the time-averaged YAMNet embedding
audio_model = tf.keras.models.load_model("best_model.keras")
# yamnet_model: embedding extractor loaded via TensorFlow Hub from the URL below
yamnet_model = hub.load('https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1')
# yolo_model: object detector whose class names are matched against the threat labels
yolo_model = YOLO(yolo_model_path)

# Ensure the directories that receive saved clips exist
# (exist_ok=True makes re-running this cell harmless)
os.makedirs("anomalous_videos", exist_ok=True)
os.makedirs("anomalous_audio", exist_ok=True)

# Class index -> human-readable label for the outputs of `audio_model`
# (the classifier that runs on averaged YAMNet embeddings, not YAMNet itself).
# The position of each name in the tuple is its class index; append further
# names here if the trained model has more output classes.
index_to_label = dict(enumerate((
    "Emergency_alert_sound",
    "Explosions",
    "Gunshots",
    "Human screams",
    "Bottles breaking",
    "Dog bark",
)))

# Butterworth band-pass design (first half of the noise-reduction step)
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Lower band edge, in the same unit as ``fs``.
        highcut: Upper band edge, in the same unit as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` coefficient pair returned by ``scipy.signal.butter``.
    """
    # butter() expects band edges as fractions of the Nyquist frequency.
    half_rate = 0.5 * fs
    band_edges = [edge / half_rate for edge in (lowcut, highcut)]
    numerator, denominator = butter(order, band_edges, btype='band')
    return numerator, denominator

def bandpass_filter(data, lowcut=300.0, highcut=3400.0, fs=16000, order=5):
    """Band-pass filter ``data`` with a Butterworth design.

    The coefficients come from ``butter_bandpass`` and are applied with
    ``scipy.signal.lfilter``.

    Args:
        data: Samples to filter.
        lowcut: Lower band edge forwarded to ``butter_bandpass``.
        highcut: Upper band edge forwarded to ``butter_bandpass``.
        fs: Sampling rate forwarded to ``butter_bandpass``.
        order: Filter order forwarded to ``butter_bandpass``.

    Returns:
        The filtered signal as returned by ``lfilter``.
    """
    coefficients = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(*coefficients, data)

# Preprocess a raw waveform before it is handed to YAMNet
def preprocess_audio(audio_waveform, target_length=16000 * 5):
    """Filter, length-normalise and peak-normalise a waveform.

    Steps:
      1. Band-pass filter with ``bandpass_filter`` (its defaults are used).
      2. Zero-pad at the end, or truncate, to exactly ``target_length`` samples.
      3. Cast to ``float32`` and scale so the largest absolute sample is 1.

    Args:
        audio_waveform: 1-D sequence of samples.
        target_length: Number of samples in the result (default ``16000 * 5``).

    Returns:
        ``np.ndarray`` of dtype ``float32`` and length ``target_length``.
        A silent (all-zero) clip is returned as zeros instead of NaNs.
    """
    # Apply noise reduction
    filtered = np.asarray(bandpass_filter(audio_waveform))

    # Pad or truncate the waveform to the target length
    n_missing = target_length - len(filtered)
    if n_missing > 0:
        filtered = np.pad(filtered, (0, n_missing))
    else:
        filtered = filtered[:target_length]

    # Normalize between -1 and 1
    normalized = filtered.astype(np.float32)
    peak = np.max(np.abs(normalized)) if normalized.size else 0.0
    # Dividing by a zero peak would turn a silent clip into NaNs, which would
    # then propagate through the embedding model and the classifier.
    if peak > 0:
        normalized /= peak
    return normalized

def predict_audio_from_stream(audio_data, sr, confidence_threshold=0.7):
    """Classify one audio clip with YAMNet embeddings and ``audio_model``.

    The clip is preprocessed with ``preprocess_audio``, embedded with
    ``yamnet_model``, the per-frame embeddings are averaged over axis 0, and the
    averaged vector is classified by ``audio_model``.

    Args:
        audio_data: 1-D waveform samples.
        sr: Sampling rate of ``audio_data``. The preprocessing defaults in this
            file (filter ``fs=16000``, ``target_length=16000 * 5``) assume
            16 kHz, so any other rate is resampled to 16 kHz first. A falsy
            value skips resampling.
        confidence_threshold: Minimum top-class score needed to report a label.

    Returns:
        ``(label, confidence)``. ``label`` is ``"Unknown"`` when the top score is
        below ``confidence_threshold`` or the class index has no entry in
        ``index_to_label``.
    """
    pipeline_rate = 16000
    if sr and sr != pipeline_rate:
        # Previously `sr` was ignored, so non-16 kHz audio was filtered and
        # embedded as if it were 16 kHz.
        audio_data = librosa.resample(
            np.asarray(audio_data, dtype=np.float32),
            orig_sr=sr,
            target_sr=pipeline_rate,
        )

    processed_audio = preprocess_audio(audio_data)
    _, yamnet_embeddings, _ = yamnet_model(processed_audio)
    # Collapse the per-frame embeddings into a single row vector for the classifier.
    avg_embedding = tf.reduce_mean(yamnet_embeddings, axis=0).numpy().reshape(1, -1)

    # verbose=0: this runs once per recorded clip; the default progress bar
    # would flood the notebook output.
    prediction = audio_model.predict(avg_embedding, verbose=0)
    predicted_class_index = np.argmax(prediction, axis=1)[0]
    confidence = prediction[0][predicted_class_index]

    if confidence >= confidence_threshold:
        predicted_class_name = index_to_label.get(predicted_class_index, "Unknown")
        return predicted_class_name, confidence
    return "Unknown", confidence

# Audio capturing and processing thread
def process_audio_stream(duration=5, sample_rate=16000, stop_event=None):
    """Record fixed-length clips in a loop and save the ones flagged as anomalous.

    Each iteration records ``duration`` seconds of mono ``float32`` audio with
    ``sounddevice``, classifies it with ``predict_audio_from_stream`` and, when
    the label is not ``"Unknown"``, writes the clip as 16-bit PCM WAV into
    ``anomalous_audio/``.

    Args:
        duration: Length of each recorded clip in seconds.
        sample_rate: Recording sample rate in Hz.
        stop_event: Optional object with an ``is_set()`` method (for example a
            ``threading.Event``). It is checked before each recording; once it
            is set the loop ends. With the default ``None`` the loop runs
            forever, as before.
    """
    print("[INFO] Starting audio detection thread...")
    frames_per_clip = int(duration * sample_rate)

    while stop_event is None or not stop_event.is_set():
        print("[INFO] Recording audio...")
        audio_data = sd.rec(frames_per_clip, samplerate=sample_rate, channels=1, dtype='float32')
        sd.wait()  # Wait until the recording is finished
        audio_data = audio_data.flatten()

        predicted_class, confidence = predict_audio_from_stream(audio_data, sample_rate)

        if predicted_class != "Unknown":
            print(f"[{datetime.now()}] Detected audio anomaly: {predicted_class} (Confidence: {confidence:.2f})")
            audio_filename = f"anomalous_audio/{predicted_class}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
            # Clip before scaling: a sample outside [-1, 1] would otherwise
            # overflow int16 and wrap around to the opposite sign.
            pcm_samples = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
            write(audio_filename, sample_rate, pcm_samples)
            print(f"[INFO] Audio saved to: {audio_filename}")

# Video threat detection
def process_video_stream(video_source=0, max_read_failures=30, display=True):
    """Run YOLO detection on a video stream and record clips while a threat is visible.

    Every frame is passed to ``yolo_model``. While any detection is labelled
    ``"violence"`` or ``"weaponized"`` the raw frames are written to an MP4 file
    in ``anomalous_videos/``; the file is closed on the first frame without such
    a detection. Pressing ``q`` in the preview window stops the loop.

    Args:
        video_source: Value passed to ``cv2.VideoCapture``.
        max_read_failures: Number of consecutive failed frame reads after which
            the loop stops. ``None`` retries forever.
        display: When true, show the annotated frame in an OpenCV window. The
            ``q`` hotkey needs this window, because ``cv2.waitKey`` only
            receives key presses while a HighGUI window exists.

    Returns:
        None. Capture, writer and windows are released even if an exception is
        raised inside the loop.
    """
    cap = cv2.VideoCapture(video_source)
    if not cap.isOpened():
        print("Error: Could not open video stream")
        return

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30  # fall back to 30 when the source reports 0
    threat_labels = ("violence", "weaponized")
    window_name = "Threat detection"

    out = None  # open VideoWriter while a clip is being recorded, otherwise None
    failed_reads = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                failed_reads += 1
                # Without a bound, an ended file or unplugged camera would spin here forever.
                if max_read_failures is not None and failed_reads >= max_read_failures:
                    print("Error: Video stream ended or is unavailable. Stopping.")
                    break
                print("Warning: Failed to capture frame, skipping...")
                continue
            failed_reads = 0

            results = yolo_model.predict(frame, conf=0.5, verbose=False)
            annotated_frame = results[0].plot()

            anomalies_detected = False
            if results[0].boxes:
                for box in results[0].boxes:
                    cls_id = int(box.cls)
                    confidence = box.conf.item()
                    label = yolo_model.names[cls_id]

                    print(f"[{datetime.now()}] Detected: {label} (Confidence: {confidence:.2f})")
                    if label in threat_labels:
                        anomalies_detected = True

                        if out is None:
                            video_filename = f"anomalous_videos/{label}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
                            # Size the writer from the actual frame; capture
                            # properties can be 0 or differ from what is read.
                            frame_height, frame_width = frame.shape[:2]
                            out = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))
                            print(f"[{datetime.now()}] Recording started: {video_filename}")

                        out.write(frame)
                        # One threat is enough to record this frame; skip the remaining boxes.
                        break

            if not anomalies_detected and out is not None:
                print(f"[{datetime.now()}] Anomaly ended. Stopping recording.")
                out.release()
                out = None

            if display:
                cv2.imshow(window_name, annotated_frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Exit requested. Stopping program.")
                break
    finally:
        if out is not None:
            out.release()
        cap.release()
        cv2.destroyAllWindows()

# Unified system
def unified_system(video_source=0):
    """Run video and audio detection concurrently until the video loop ends.

    ``process_video_stream`` runs in a regular thread and is joined.
    ``process_audio_stream`` runs in a daemon thread and is not joined: when
    called without arguments its recording loop never returns, so joining it
    would block this function forever after the video loop has exited. As a
    daemon it no longer keeps the interpreter alive when this is run as a script.

    Args:
        video_source: Forwarded to ``process_video_stream``.
    """
    video_thread = threading.Thread(
        target=process_video_stream, args=(video_source,), name="video-detector"
    )
    audio_thread = threading.Thread(
        target=process_audio_stream, name="audio-detector", daemon=True
    )

    video_thread.start()
    audio_thread.start()

    # Only the video loop has an exit condition; it decides the system's lifetime.
    video_thread.join()

# Entry point
# Starts the combined audio/video detection when this code is executed directly.
# video_source=0 is forwarded to cv2.VideoCapture (presumably the default camera — confirm on the target machine).
if __name__ == "__main__":
    unified_system(video_source=0)















[INFO] Starting audio detection thread...
[INFO] Recording audio...
[2025-01-11 20:30:05.034120] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.159590] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.278436] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.387172] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.490627] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.600810] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.698677] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.798740] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.888357] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:05.986767] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:06.086471] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:06.182957] Detected: non-violence (Confidence: 0.98)
[2025-01-11 20:30:06.316162] Detected: non-violence (Confidence: 0.98)
[2025-01-

In [1]:
# Diagnostic cell: list the audio devices sounddevice can see, to check which
# input device the recording loop will use. In the printed table the lines
# starting with '>' and '<' appear to mark the default input and output — confirm in the sounddevice docs.
import sounddevice as sd
print(sd.query_devices())


   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
>  1 Microphone Array (Intel® Smart , MME (2 in, 0 out)
   2 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  3 Speaker (Realtek(R) Audio), MME (0 in, 2 out)
   4 Primary Sound Capture Driver, Windows DirectSound (2 in, 0 out)
   5 Microphone Array (Intel® Smart Sound Technology for Digital Microphones), Windows DirectSound (2 in, 0 out)
   6 Primary Sound Driver, Windows DirectSound (0 in, 2 out)
   7 Speaker (Realtek(R) Audio), Windows DirectSound (0 in, 2 out)
   8 Speaker (Realtek(R) Audio), Windows WASAPI (0 in, 2 out)
   9 Microphone Array (Intel® Smart Sound Technology for Digital Microphones), Windows WASAPI (2 in, 0 out)
  10 Microphone Array 1 (), Windows WDM-KS (2 in, 0 out)
  11 Microphone Array 2 (), Windows WDM-KS (2 in, 0 out)
  12 Microphone Array 3 (), Windows WDM-KS (4 in, 0 out)
  13 Speakers 1 (Realtek HD Audio output with SST), Windows WDM-KS (0 in, 2 out)
  14 Speakers 2 (Realtek HD Audio output with SST)