In [9]:
# from moviepy.editor import VideoFileClip
from audio_extract import extract_audio


In [16]:

def video_to_audio(video_path, audio_path):
    """
    Extract the audio track of a video file and write it to ``audio_path``.

    The work is delegated to ``extract_audio``. This wrapper passes only the
    input and output paths and never requests a specific audio format.
    NOTE(review): the resulting format is therefore decided by
    ``extract_audio`` itself -- confirm which format it produces for the
    chosen output path before relying on a particular one.

    Any exception raised during extraction is caught and reported on stdout
    rather than propagated, so callers cannot tell success from failure via
    the return value.

    Args:
        video_path (str): Path to the input video file.
        audio_path (str): Path where the extracted audio is saved.

    Returns:
        None
    """
    try:
        extract_audio(input_path=video_path, output_path=audio_path)
    except Exception as exc:
        # Best-effort helper: report the problem and keep the notebook running.
        message = f"An error occurred: {exc}"
        print(message)

# Example usage: extract the audio track of a sample video.
# video_path is the source video, audio_path is where the audio is written.
video_path, audio_path = "input_video.mp4", "audio2.mp3"
video_to_audio(video_path, audio_path)


Success : audio file has been saved to "d:\VS Code Folders\audio_activity_detection\audio2.mp3".


In [None]:
import json
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Your custom Dataset class
class AudioDataset(torch.utils.data.Dataset):
    """
    Dataset pairing audio files with their annotations.

    Feature extraction is still a placeholder: every item is a randomly
    generated spectrogram-shaped tensor plus random *binary* labels, so the
    rest of the training pipeline can be exercised before real audio loading
    is implemented.

    Args:
        audio_files (list): Keys identifying the audio files of this split.
        annotations (dict): Mapping from audio file key to its annotation
            entry. Only the presence of the key is checked for now.
    """

    def __init__(self, audio_files, annotations):
        self.audio_files = audio_files
        self.annotations = annotations
        # Feature-extraction settings. Only n_mels is used by the placeholder;
        # sr / hop_length are presumably the sample rate and STFT hop for the
        # real implementation -- TODO confirm when it is written.
        self.sr = 22050
        self.hop_length = 512
        self.n_mels = 64

    def __len__(self):
        """Return the number of audio files in this split."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """
        Return ``(mel_spec, labels)`` for the file at position ``idx``.

        Returns:
            tuple: ``mel_spec`` is a float tensor of shape
            ``(1, n_mels, 100)`` and ``labels`` a float tensor of shape
            ``(100,)`` holding only 0.0 / 1.0 values.

        Raises:
            IndexError: If ``idx`` is outside ``audio_files``.
            KeyError: If the file has no entry in ``annotations``.
        """
        audio_file = self.audio_files[idx]
        # Looked up even though unused so that a file without an annotation
        # entry fails here (KeyError) instead of silently yielding data.
        timestamps = self.annotations[audio_file]  # TODO: derive labels from this

        # Placeholder features/labels; replace with real audio loading,
        # mel-spectrogram extraction and label generation.
        num_frames = 100
        mel_spec = torch.randn(1, self.n_mels, num_frames)
        # Labels must be binary (0/1): they are used as BCELoss targets and
        # compared against thresholded predictions. Gaussian noise is neither.
        labels = torch.randint(0, 2, (num_frames,)).float()

        return mel_spec, labels

# Define the LSTM model (Your LSTMSoundClassifier)
class LSTMSoundClassifier(nn.Module):
    """
    LSTM followed by a linear layer and a sigmoid.

    The prediction is computed from the LSTM output at the last time step
    only, so each input sequence yields ``output_size`` probabilities.

    Args:
        input_size (int): Number of features per time step.
        hidden_size (int): Size of the LSTM hidden state.
        num_layers (int): Number of stacked LSTM layers.
        output_size (int): Number of sigmoid outputs per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMSoundClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Compute per-sequence probabilities.

        Args:
            x (torch.Tensor): Either ``(batch, time, input_size)`` or a
                single-channel spectrogram batch
                ``(batch, 1, input_size, time)``, which is the layout
                produced by batching ``(1, n_mels, frames)`` items.

        Returns:
            torch.Tensor: Shape ``(batch, output_size)`` with values in [0, 1].

        Raises:
            ValueError: If a 4-D input has more than one channel.
        """
        if x.dim() == 4:
            # nn.LSTM only accepts (batch, time, features). Drop the singleton
            # channel axis and move time in front of the feature axis.
            if x.size(1) != 1:
                raise ValueError(
                    f"Expected a single-channel input, got {x.size(1)} channels"
                )
            x = x.squeeze(1).transpose(1, 2).contiguous()

        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Only take the last LSTM output
        out = self.sigmoid(out)  # Apply sigmoid for binary classification
        return out

# Function to train model with accuracy and progress bar
def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs=10):
    """
    Train a binary classifier, validating and printing a summary every epoch.

    Accuracy is computed by thresholding the model outputs at 0.5 and
    comparing them element-wise with the labels, so the outputs are expected
    to be probabilities and the labels 0/1 values.

    Args:
        model: Module called as ``model(mel_spec)``.
        train_dataloader: Iterable of ``(mel_spec, labels)`` training batches.
        val_dataloader: Iterable of ``(mel_spec, labels)`` validation batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs (int): Number of passes over the training data.

    Returns:
        list[dict]: One dict per epoch with the keys ``train_loss``,
        ``train_accuracy``, ``val_loss`` and ``val_accuracy``. A metric is
        NaN when the corresponding loader yielded no batches.

    Raises:
        ValueError: If a batch's outputs and labels differ in element count.
    """
    history = []
    model.train()  # Set model to training mode

    for epoch in range(epochs):
        # Wrapping the loader lets tqdm advance the bar on its own.
        with tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            avg_loss, accuracy = _run_epoch(
                model, pbar, criterion, optimizer=optimizer, progress=pbar
            )

        # Validation phase: no weight updates and no gradient tracking.
        model.eval()
        with torch.no_grad():
            val_avg_loss, val_accuracy = _run_epoch(model, val_dataloader, criterion)

        # Print epoch summary
        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f} - "
              f"Validation Loss: {val_avg_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        history.append({
            "train_loss": avg_loss,
            "train_accuracy": accuracy,
            "val_loss": val_avg_loss,
            "val_accuracy": val_accuracy,
        })

        model.train()  # Set back to training mode after validation

    return history


def _run_epoch(model, batches, criterion, optimizer=None, progress=None):
    """Run one pass over ``batches``; return (mean batch loss, element accuracy).

    Weights are updated only when ``optimizer`` is given. ``progress`` is an
    optional tqdm bar whose postfix is refreshed with the latest batch loss.
    """
    total_loss = 0.0
    num_batches = 0
    correct_predictions = 0
    total_samples = 0

    for mel_spec, labels in batches:
        if optimizer is not None:
            optimizer.zero_grad()  # Reset gradients

        outputs = _match_label_shape(model(mel_spec), labels)
        loss = criterion(outputs, labels)

        if optimizer is not None:
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        # Outputs and labels share one shape here, so the comparison is
        # strictly element-wise and can never broadcast to a larger tensor.
        predictions = (outputs > 0.5).float()
        correct_predictions += (predictions == labels).sum().item()
        total_samples += labels.numel()

        if progress is not None:
            progress.set_postfix(loss=f"{batch_loss:.4f}")

    # An empty loader gives NaN instead of a ZeroDivisionError.
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    accuracy = correct_predictions / total_samples if total_samples else float("nan")
    return mean_loss, accuracy


def _match_label_shape(outputs, labels):
    """Return ``outputs`` reshaped to the shape of ``labels``.

    Replaces a bare ``.squeeze()``, which also drops a batch axis of size 1
    and leaves predictions and labels with different shapes.
    """
    if outputs.shape == labels.shape:
        return outputs
    if outputs.numel() != labels.numel():
        raise ValueError(
            f"Model produced outputs of shape {tuple(outputs.shape)} but the batch has "
            f"labels of shape {tuple(labels.shape)}; the element counts must match."
        )
    return outputs.reshape(labels.shape)

# Example Usage
if __name__ == "__main__":
    # Seed torch so weight initialisation and batch shuffling are repeatable,
    # matching the fixed random_state used for the file split below.
    torch.manual_seed(42)

    # Load JSON annotations (JSON is UTF-8; do not rely on the platform default)
    with open("annotations.json", "r", encoding="utf-8") as f:
        annotations = json.load(f)

    # Audio files list
    audio_files = list(annotations.keys())
    if not audio_files:
        raise ValueError("annotations.json does not list any audio files")

    # Dataset over every file; used below to inspect the shape of one sample
    dataset = AudioDataset(audio_files, annotations)

    # Split into training and validation sets (80% train, 20% validation)
    train_files, val_files = train_test_split(audio_files, test_size=0.2, random_state=42)

    # Build one dataset per split
    train_dataset = AudioDataset(train_files, annotations)
    val_dataset = AudioDataset(val_files, annotations)

    # Create DataLoaders
    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

    # Initialize Model, Loss, Optimizer.
    # BCELoss requires predictions and targets of identical shape, so the
    # classifier must emit one value per label the dataset yields for a clip.
    # Size the output layer from a sample instead of hard-coding 1.
    _, sample_labels = dataset[0]
    model = LSTMSoundClassifier(
        input_size=dataset.n_mels,
        hidden_size=128,
        num_layers=2,
        output_size=sample_labels.numel(),
    )
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train Model
    train_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs=10)
