In [1]:
# Install required packages
!pip install librosa tensorflow-hub panns-inference kagglehub tqdm

# Import libraries
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import tensorflow_hub as hub
import torch
import torchaudio
from tqdm import tqdm
import warnings
import kagglehub
warnings.filterwarnings('ignore')

Collecting panns-inference
  Downloading panns_inference-0.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting torchlibrosa (from panns-inference)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Downloading panns_inference-0.1.1-py3-none-any.whl (8.3 kB)
Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Installing collected packages: torchlibrosa, panns-inference
Successfully installed panns-inference-0.1.1 torchlibrosa-0.1.0


In [2]:
# Check GPU availability
print("GPU Available:", tf.config.experimental.list_physical_devices('GPU'))
print("CUDA Available:", torch.cuda.is_available())


GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
CUDA Available: True


In [3]:
# ==========================================
# STEP 2: Download Dataset
# ==========================================

print("Downloading dataset...")
dataset_path = kagglehub.dataset_download("murtadhanajim/gender-recognition-by-voiceoriginal")
print(f"Dataset downloaded to: {dataset_path}")

Downloading dataset...
Dataset downloaded to: /kaggle/input/gender-recognition-by-voiceoriginal


In [4]:
# ==========================================
# STEP 3: Model Loading Functions
# ==========================================

class AudioFeatureExtractor:
    def __init__(self):
        """Initialize the three pre-trained models exactly as described in the paper"""
        print("Loading pre-trained models...")

        # 1. VGGish Model (128-dimensional features)
        print("Loading VGGish...")
        self.vggish_model = hub.load('https://tfhub.dev/google/vggish/1')

        # 2. YAMNet Model (1024-dimensional features)
        print("Loading YAMNet...")
        self.yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

        # 3. PANNs Model (2048-dimensional features)
        print("Setting up PANNs...")
        self.setup_panns()

        print("All models loaded successfully!")

    def setup_panns(self):
        """Setup PANNs model"""
        try:
            from panns_inference import AudioTagging
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.panns_model = AudioTagging(checkpoint_path=None, device=device)
            print(f"PANNs loaded on {device}")
        except Exception as e:
            print(f"Error loading PANNs: {e}")
            print("Installing panns_inference...")
            !pip install panns_inference
            from panns_inference import AudioTagging
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.panns_model = AudioTagging(checkpoint_path=None, device=device)

    def load_audio(self, file_path, target_sr=16000):
        """Load and preprocess audio file"""
        try:
            audio, sr = librosa.load(file_path, sr=target_sr, mono=True)
            return audio, sr
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None, None

    def extract_vggish_features(self, audio, sr=16000):
        """
        Extract VGGish features (128-dimensional)
        Paper: "VGGish model generates a 128-dimensional deep feature vector for every 0.96 s"
        "we averaged those 128-dimensional VGGish-based deep features"
        """
        try:
            if len(audio) == 0:
                return np.zeros(128)

            # VGGish expects float32 tensor
            audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)

            # Extract embeddings - returns features for each 0.96s segment
            embeddings = self.vggish_model(audio_tensor)

            # Average across time dimension as described in paper
            if len(embeddings.shape) > 1:
                features = tf.reduce_mean(embeddings, axis=0)
            else:
                features = embeddings

            return features.numpy()

        except Exception as e:
            print(f"VGGish extraction error: {e}")
            return np.zeros(128)

    def extract_yamnet_features(self, audio, sr=16000):
        """
        Extract YAMNet features (1024-dimensional)
        Paper: "YAMNet produces a 1024-dimensional deep feature vector for every 0.48 s"
        "we averaged those 1024-dimensional YAMNet-based deep features"
        """
        try:
            if len(audio) == 0:
                return np.zeros(1024)

            audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)

            # Get YAMNet outputs: scores, embeddings, spectrogram
            scores, embeddings, spectrogram = self.yamnet_model(audio_tensor)

            # Average embeddings across time as described in paper
            if len(embeddings.shape) > 1:
                features = tf.reduce_mean(embeddings, axis=0)
            else:
                features = embeddings

            return features.numpy()

        except Exception as e:
            print(f"YAMNet extraction error: {e}")
            return np.zeros(1024)

    def extract_panns_features(self, audio, sr=16000):
        """
        Extract PANNs features (2048-dimensional)
        Paper: "PANNs are pre-trained models specifically developed for audio pattern recognition"
        "These audio patterns are then mapped to a 2048-dimensional output space"
        """
        try:
            if len(audio) == 0:
                return np.zeros(2048)

            # PANNs typically expects 32kHz audio
            if sr != 32000:
                audio_32k = librosa.resample(audio, orig_sr=sr, target_sr=32000)
            else:
                audio_32k = audio

            # Get PANNs inference
            (clipwise_output, embedding) = self.panns_model.inference(audio_32k[None, :])

            # Ensure 2048 dimensions as specified in paper
            if embedding.shape[1] == 2048:
                return embedding[0]
            elif embedding.shape[1] > 2048:
                return embedding[0][:2048]  # Truncate if larger
            else:
                # Pad if smaller
                padded = np.zeros(2048)
                padded[:embedding.shape[1]] = embedding[0]
                return padded

        except Exception as e:
            print(f"PANNs extraction error: {e}")
            return np.zeros(2048)

    def extract_all_features(self, file_path):
        """Extract features from all three models for a single audio file"""
        # Load audio at 16kHz (standard for these models)
        audio, sr = self.load_audio(file_path, target_sr=16000)

        if audio is None:
            return {
                'vggish': np.zeros(128),
                'yamnet': np.zeros(1024),
                'panns': np.zeros(2048)
            }

        # Extract features from each model
        features = {
            'vggish': self.extract_vggish_features(audio, sr),
            'yamnet': self.extract_yamnet_features(audio, sr),
            'panns': self.extract_panns_features(audio, sr)
        }

        return features

In [5]:
# ==========================================
# STEP 4: Dataset Processing Function
# ==========================================

def process_gender_dataset(dataset_path, output_dir='./audio_features'):
    """Process the gender recognition dataset and extract features"""

    os.makedirs(output_dir, exist_ok=True)

    # Initialize extractor
    print("Initializing feature extractor...")
    extractor = AudioFeatureExtractor()

    # Find all audio files
    audio_extensions = ['.wav', '.mp3', '.flac', '.m4a', '.ogg']
    audio_files = []

    print("Scanning for audio files...")
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            if any(file.lower().endswith(ext) for ext in audio_extensions):
                audio_files.append(os.path.join(root, file))

    print(f"Found {len(audio_files)} audio files")

    if len(audio_files) == 0:
        print("No audio files found! Please check the dataset path.")
        return None, None

    # Initialize feature storage
    features_dict = {
        'vggish': [],
        'yamnet': [],
        'panns': []
    }

    file_metadata = []

    # Process each file
    print("Extracting features...")
    for i, file_path in enumerate(tqdm(audio_files, desc="Processing audio files")):
        try:
            # Extract features
            features = extractor.extract_all_features(file_path)

            # Store features
            features_dict['vggish'].append(features['vggish'])
            features_dict['yamnet'].append(features['yamnet'])
            features_dict['panns'].append(features['panns'])

            # Extract metadata from filename (assuming gender is encoded in filename)
            filename = os.path.basename(file_path)

            # Try to extract gender from filename (adapt this based on your dataset structure)
            gender = 'unknown'  # Default
            if 'male' in filename.lower() or 'm_' in filename.lower():
                gender = 'male'
            elif 'female' in filename.lower() or 'f_' in filename.lower():
                gender = 'female'

            file_metadata.append({
                'index': i,
                'filename': filename,
                'file_path': file_path,
                'gender': gender
            })

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            # Add zero features for failed files
            features_dict['vggish'].append(np.zeros(128))
            features_dict['yamnet'].append(np.zeros(1024))
            features_dict['panns'].append(np.zeros(2048))

            file_metadata.append({
                'index': i,
                'filename': os.path.basename(file_path),
                'file_path': file_path,
                'gender': 'unknown'
            })

    # Convert to numpy arrays
    for model_name in features_dict:
        features_dict[model_name] = np.array(features_dict[model_name])
        print(f"{model_name.upper()} features shape: {features_dict[model_name].shape}")

    # Save features
    print("Saving extracted features...")
    for model_name, features_array in features_dict.items():
        np.save(os.path.join(output_dir, f'{model_name}_features.npy'), features_array)

    # Save metadata
    metadata_df = pd.DataFrame(file_metadata)
    metadata_df.to_csv(os.path.join(output_dir, 'file_metadata.csv'), index=False)

    print(f"Features saved to: {output_dir}")

    # Display feature statistics
    print("\n" + "="*50)
    print("FEATURE EXTRACTION SUMMARY")
    print("="*50)
    print(f"Total files processed: {len(file_metadata)}")

    for model_name, features_array in features_dict.items():
        print(f"\n{model_name.upper()} Features:")
        print(f"  Shape: {features_array.shape}")
        print(f"  Mean: {np.mean(features_array):.4f}")
        print(f"  Std: {np.std(features_array):.4f}")
        print(f"  Min: {np.min(features_array):.4f}")
        print(f"  Max: {np.max(features_array):.4f}")

    # Gender distribution
    print(f"\nGender Distribution:")
    print(metadata_df['gender'].value_counts())

    return features_dict, metadata_df


In [6]:
# ==========================================
# STEP 5: Execute Feature Extraction
# ==========================================

# Run the feature extraction
print("Starting feature extraction process...")
features, metadata = process_gender_dataset(dataset_path)

if features is not None:
    print("\n" + "="*50)
    print("FEATURE EXTRACTION COMPLETED SUCCESSFULLY!")
    print("="*50)

    # Display final results
    print("Feature dimensions match paper specifications:")
    print(f"✓ VGGish: {features['vggish'].shape[1]} dimensions (expected: 128)")
    print(f"✓ YAMNet: {features['yamnet'].shape[1]} dimensions (expected: 1024)")
    print(f"✓ PANNs: {features['panns'].shape[1]} dimensions (expected: 2048)")

    print(f"\nFeatures saved in './audio_features/' directory")
    print("Files created:")
    print("- vggish_features.npy")
    print("- yamnet_features.npy")
    print("- panns_features.npy")
    print("- file_metadata.csv")

else:
    print("Feature extraction failed. Please check the dataset path and file formats.")


Starting feature extraction process...
Initializing feature extractor...
Loading pre-trained models...
Loading VGGish...
Loading YAMNet...
Setting up PANNs...
Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1
PANNs loaded on cuda
All models loaded successfully!
Scanning for audio files...
Found 16148 audio files
Extracting features...


Processing audio files: 100%|██████████| 16148/16148 [07:59<00:00, 33.66it/s]


VGGISH features shape: (16148, 128)
YAMNET features shape: (16148, 1024)
PANNS features shape: (16148, 2048)
Saving extracted features...
Features saved to: ./audio_features

FEATURE EXTRACTION SUMMARY
Total files processed: 16148

VGGISH Features:
  Shape: (16148, 128)
  Mean: nan
  Std: nan
  Min: nan
  Max: nan

YAMNET Features:
  Shape: (16148, 1024)
  Mean: 0.1065
  Std: 0.2188
  Min: 0.0000
  Max: 3.7249

PANNS Features:
  Shape: (16148, 2048)
  Mean: 0.0942
  Std: 0.2484
  Min: 0.0000
  Max: 4.3367

Gender Distribution:
gender
unknown    16148
Name: count, dtype: int64

FEATURE EXTRACTION COMPLETED SUCCESSFULLY!
Feature dimensions match paper specifications:
✓ VGGish: 128 dimensions (expected: 128)
✓ YAMNet: 1024 dimensions (expected: 1024)
✓ PANNs: 2048 dimensions (expected: 2048)

Features saved in './audio_features/' directory
Files created:
- vggish_features.npy
- yamnet_features.npy
- panns_features.npy
- file_metadata.csv


In [None]:
# ==========================================
# STEP 6: Feature Loading Function for Later Use
# ==========================================

'''def load_extracted_features(features_dir='./audio_features'):
    """Load previously extracted features"""
    try:
        features = {}
        features['vggish'] = np.load(os.path.join(features_dir, 'vggish_features.npy'))
        features['yamnet'] = np.load(os.path.join(features_dir, 'yamnet_features.npy'))
        features['panns'] = np.load(os.path.join(features_dir, 'panns_features.npy'))

        metadata = pd.read_csv(os.path.join(features_dir, 'file_metadata.csv'))

        print("Features loaded successfully!")
        for model_name, feature_array in features.items():
            print(f"{model_name}: {feature_array.shape}")

        return features, metadata

    except Exception as e:
        print(f"Error loading features: {e}")
        return None, None
'''

# features, metadata = load_extracted_features()



In [8]:
# prompt: Inspect the VGGish features array

print("VGGish Features Shape:", vggish_features.shape)
print("First 5 VGGish Features:")
print(vggish_features[:5])

VGGish Features Shape: (16148, 128)
First 5 VGGish Features:
[[-1.17662460e-01  1.06528535e-01 -1.36822879e-01 -5.79937458e-01
   1.45776987e-01 -2.72055119e-01 -8.25421095e-01  3.71271521e-01
  -7.66780555e-01 -6.77676082e-01 -8.69919896e-01 -2.54069090e-01
  -1.39017797e+00 -5.68271041e-01 -2.37168193e-01 -1.27535596e-01
  -3.36427718e-01  3.64160687e-01 -1.92880377e-01 -2.03614026e-01
   8.58277231e-02  1.40275657e-02 -2.23799914e-01  2.58785486e-01
   1.00702167e-01 -1.85411483e-01 -2.61564255e-02  6.81840360e-01
  -3.82189274e-01 -4.53151584e-01 -2.65098423e-01 -6.34601116e-02
  -3.90202016e-01  1.55591175e-01  5.95691442e-01 -3.30925405e-01
  -8.25355828e-01 -3.99493039e-01 -8.43472540e-01 -6.35698974e-01
   1.82024091e-01 -4.86520588e-01 -1.80906072e-01 -5.20078957e-01
   4.43429381e-01 -2.46865749e-02  3.82902056e-01  3.74834239e-02
  -3.42325866e-02  2.03785226e-01  4.10499275e-01 -5.24551034e-01
   2.63129592e-01 -1.23950016e+00  1.49423003e-01 -5.65248847e-01
   5.71267009e-

In [10]:
# Check for NaN values in VGGish features
nan_in_vggish = np.isnan(features['vggish']).any()
print(f"Are there any NaN values in VGGish features? {nan_in_vggish}")

if nan_in_vggish:
    nan_rows_vggish = np.isnan(features['vggish']).any(axis=1)
    num_nan_rows_vggish = np.sum(nan_rows_vggish)
    print(f"Number of rows with NaN values in VGGish features: {num_nan_rows_vggish}")

    # You can also find the indices of these rows
    nan_row_indices_vggish = np.where(nan_rows_vggish)[0]
    print(f"Indices of rows with NaN values (first 10): {nan_row_indices_vggish[:10]}")

    # To see the metadata for these files (assuming metadata is indexed the same way)
    if metadata is not None:
        print("\nMetadata for first 10 files with NaN in VGGish features:")
        display(metadata.iloc[nan_row_indices_vggish[:10]])

Are there any NaN values in VGGish features? True
Number of rows with NaN values in VGGish features: 4
Indices of rows with NaN values (first 10): [ 2214  7618 11722 14702]

Metadata for first 10 files with NaN in VGGish features:


Unnamed: 0,index,filename,file_path,gender
2214,2214,arctic_a0207(4).wav,/kaggle/input/gender-recognition-by-voiceorigi...,unknown
7618,7618,arctic_a0207(1).wav,/kaggle/input/gender-recognition-by-voiceorigi...,unknown
11722,11722,arctic_a0329(3).wav,/kaggle/input/gender-recognition-by-voiceorigi...,unknown
14702,14702,arctic_a0542(6).wav,/kaggle/input/gender-recognition-by-voiceorigi...,unknown
