1. Data Pre-processing

In [10]:
import os
import librosa
import numpy as np

FIXED_WIDTH = 400  # Number of spectrogram frames (time axis) kept per clip


def audio_to_spectrogram(file_path):
    """Turn one audio file into a fixed-width log-mel spectrogram.

    The file is loaded at its native sampling rate (``sr=None``), converted
    to a 128-band mel power spectrogram and rescaled to decibels with
    ``ref=np.max``. The time axis is then forced to exactly FIXED_WIDTH
    frames by truncating or right-padding.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        2-D array of shape (128, FIXED_WIDTH).
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    n_frames = mel_db.shape[1]
    if n_frames >= FIXED_WIDTH:
        # Long enough: keep only the leading FIXED_WIDTH frames.
        return mel_db[:, :FIXED_WIDTH]

    # Too short: right-pad the time axis with zeros.
    # NOTE(review): with ref=np.max the dB values are <= 0, so a pad value of
    # 0 sits at the clip's peak level rather than at silence -- confirm this
    # is intended before changing it (the evaluation cells pad the same way).
    missing = FIXED_WIDTH - n_frames
    return np.pad(mel_db, ((0, 0), (0, missing)), mode='constant')

def load_labels_from_protocol(protocol_path):
    """Parse an ASVspoof protocol file into a ``{filename: label}`` mapping.

    Every line is split on whitespace; lines with fewer than four fields are
    ignored. The second field is used as the filename and the fourth as the
    key.

    Args:
        protocol_path: Path of the protocol text file.

    Returns:
        dict mapping each filename to 1 when its key is 'human' and to 0 for
        any other key.
    """
    print(f"Loading labels from: {protocol_path}")

    label_by_file = {}
    with open(protocol_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 4:
                continue
            # ASVspoof 2015 layout: speaker | filename | system | key
            utterance, key = fields[1], fields[3]
            label_by_file[utterance] = int(key == 'human')
    return label_by_file

def load_dataset(dataset_path, protocol_path, max_files=None):
    """Build the spectrogram array and label vector for the training data.

    Labels are read from the protocol file, then ``dataset_path`` is walked
    for ``.wav`` files (descending only into sub-folders whose name starts
    with 'T'). Each file whose name (without extension) appears in the
    protocol is converted with ``audio_to_spectrogram``; files without a
    label are skipped, and files that fail to convert are reported and
    skipped.

    Directory and file names are visited in sorted order so that the row
    order of the result, and the subset chosen by ``max_files``, do not
    depend on the order in which the file system lists entries.

    Args:
        dataset_path: Root folder to scan for audio files.
        protocol_path: Protocol file passed to ``load_labels_from_protocol``.
        max_files: If truthy, only the first ``max_files`` discovered files
            are considered (the limit is applied before label filtering).

    Returns:
        Tuple ``(X, y)``: ``X`` is the stacked spectrograms with a trailing
        channel axis added, ``y`` the matching labels. Both are empty arrays
        when nothing could be loaded.
    """
    # 1. Load labels first
    label_by_name = load_labels_from_protocol(protocol_path)

    # 2. Collect every .wav file, in a deterministic order
    print(f"Scanning for files in {dataset_path}...")
    wav_paths = []
    for root, dirs, files in os.walk(dataset_path):
        # Assigning to dirs[:] prunes the walk in place, so only folders
        # starting with 'T' are descended into; sorting fixes the order.
        dirs[:] = sorted(d for d in dirs if d.startswith('T'))
        wav_paths.extend(
            os.path.join(root, name) for name in sorted(files) if name.endswith(".wav")
        )

    if max_files:
        wav_paths = wav_paths[:max_files]

    print(f"Found {len(wav_paths)} audio files in 'T' folders. Processing...")

    # 3. Convert every labelled file
    spectrograms, final_labels = [], []
    for wav_path in wav_paths:
        file_name = os.path.basename(wav_path)
        stem = os.path.splitext(file_name)[0]

        # Files without a protocol entry are skipped.
        if stem not in label_by_name:
            continue

        try:
            spec = audio_to_spectrogram(wav_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error processing {file_name}: {e}")
            continue

        spectrograms.append(spec)
        final_labels.append(label_by_name[stem])

    if not spectrograms:
        print("ERROR: No valid data found! Check your paths and protocol file.")
        return np.array([]), np.array([])

    # Append a channel axis: (n, height, width) -> (n, height, width, 1)
    X = np.array(spectrograms)[..., np.newaxis]
    y = np.array(final_labels)

    return X, y

if __name__ == "__main__":
    # --- CONFIGURATION ---
    # Machine-specific locations of the audio folder and the training
    # protocol; edit these two paths when running elsewhere.
    dataset_folder = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav"
    protocol_file = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_train.trn"
    # ---------------------

    print(f"Processing data from: {dataset_folder}")
    print(f"Using protocol: {protocol_file}")

    # Name every missing path explicitly instead of printing a generic error.
    missing_paths = [
        candidate for candidate in (dataset_folder, protocol_file)
        if not os.path.exists(candidate)
    ]

    if missing_paths:
        for missing_path in missing_paths:
            print(f"Error: Path not found: {missing_path}")
    else:
        # Set max_files=None to process the entire dataset
        X, y = load_dataset(dataset_folder, protocol_file, max_files=None)

        if X.size > 0:
            print(f"Data Loaded. X shape: {X.shape}, y shape: {y.shape}")
            # Saved to the current working directory; the training cell
            # loads these two files.
            np.save("X.npy", X)
            np.save("y.npy", y)
            print("SUCCESS: Saved X.npy and y.npy")
        else:
            print("ERROR: No data was loaded. Check if the folders are empty.")

Processing data from: c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav
Using protocol: c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_train.trn
Loading labels from: c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_train.trn
Scanning for files in c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav...
Found 16375 audio files in 'T' folders. Processing...
Data Loaded. X shape: (16375, 128, 400, 1), y shape: (16375,)
SUCCESS: Saved X.npy and y.npy


2. Model Training

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import os

def build_model(input_shape):
    """Assemble and compile the spoof detector on top of EfficientNetB0.

    An ImageNet-initialised EfficientNetB0 without its classification top is
    used as a frozen feature extractor, followed by global average pooling,
    a 128-unit ReLU layer, dropout (0.5) and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input sample, passed to EfficientNetB0.

    Returns:
        The compiled Sequential model (Adam with learning rate 1e-4, binary
        cross-entropy loss, accuracy metric).
    """
    print(f"Building model with input shape: {input_shape}")

    # Pre-trained backbone: no classification head, ImageNet weights.
    backbone = tf.keras.applications.EfficientNetB0(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape,
    )
    # Frozen, so only the layers added below are trained.
    backbone.trainable = False

    # Classification head stacked on the backbone, in order.
    stack = (
        backbone,
        layers.GlobalAveragePooling2D(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid'),   # Output: 0=Spoof, 1=Bonafide
    )
    detector = models.Sequential()
    for layer in stack:
        detector.add(layer)

    detector.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
        metrics=['accuracy'],
    )
    return detector

if __name__ == "__main__":
    # 1. Load the arrays written by the pre-processing step
    print("Loading data...")
    if not (os.path.exists("X.npy") and os.path.exists("y.npy")):
        # Report and fall through rather than calling exit(): inside a
        # notebook exit() requests a kernel shutdown, not just a cell stop.
        print("Error: X.npy or y.npy not found. Run preprocess.py first!")
    else:
        # Seed TensorFlow so weight initialisation and shuffling are
        # repeatable between runs.
        tf.random.set_seed(42)

        X = np.load("X.npy")
        y = np.load("y.npy")

        print(f"Loaded X: {X.shape}, y: {y.shape}")

        # 2. Convert 1-channel grayscale to 3-channel RGB
        # The ImageNet weights expect 3 input channels, so the single
        # spectrogram channel is repeated three times.
        if X.shape[-1] == 1:
            print("Converting grayscale to 3-channel RGB...")
            X = np.repeat(X, 3, axis=-1)

        # 3. Split into Train (80%) and Validation (20%)
        # stratify=y keeps the human/spoof ratio identical in both parts,
        # which matters because the classes are not balanced.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # 4. Build and Train
        # input_shape is one sample's shape, e.g. (128, 400, 3)
        model = build_model(input_shape=X.shape[1:])

        print("Starting training... (This may take a while)")
        history = model.fit(
            X_train, y_train,
            epochs=10,
            batch_size=32,
            validation_data=(X_test, y_test)
        )

        # 5. Save the final model
        model.save("deepfake_detector.h5")
        print("SUCCESS: Model saved as deepfake_detector.h5")

Loading data...
Loaded X: (16375, 128, 400, 1), y: (16375,)
Converting grayscale to 3-channel RGB...
Building model with input shape: (128, 400, 3)
Starting training... (This may take a while)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


SUCCESS: Model saved as deepfake_detector.h5


3. Evaluate on Development data

In [None]:
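# One-by-one processing (slow reference version, disabled by wrapping it in a string literal; the batched cell below supersedes it)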
'''
import os
import numpy as np
import tensorflow as tf
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- CONFIGURATION ---
# We use the paths we discovered earlier
DATASET_FOLDER = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav"
PROTOCOL_FILE  = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_develop.ndx"
MODEL_FILE     = "deepfake_detector.h5"

FIXED_WIDTH = 400

def audio_to_spectrogram(file_path):
    """ Converts audio file to a spectrogram (Same as preprocess.py) """
    y, sr = librosa.load(file_path, sr=None)
    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    spec_db = librosa.power_to_db(spec, ref=np.max)

    if spec_db.shape[1] < FIXED_WIDTH:
        pad_width = FIXED_WIDTH - spec_db.shape[1]
        spec_db = np.pad(spec_db, ((0, 0), (0, pad_width)), mode='constant')
    else:
        spec_db = spec_db[:, :FIXED_WIDTH]

    # IMPORTANT: Model expects (128, 400, 3), but this generates (128, 400)
    # We add dimensions to match:
    spec_db = spec_db[..., np.newaxis]       # (128, 400, 1)
    spec_db = np.repeat(spec_db, 3, axis=-1) # (128, 400, 3)
    
    return spec_db

def load_labels(protocol_path):
    labels = {}
    print(f"Loading labels from: {protocol_path}")
    with open(protocol_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 4:
                file_name = parts[1]
                label = 1 if parts[3] == "human" else 0
                labels[file_name] = label
    return labels

def find_all_wav_files(root_folder):
    """ Creates a dictionary {filename: full_path} for quick lookup """
    print(f"Indexing audio files in {root_folder}...")
    file_map = {}
    for root, dirs, files in os.walk(root_folder):
        for file in files:
            if file.endswith(".wav"):
                # Store "train_00001" -> "path/to/train_00001.wav"
                name_no_ext = os.path.splitext(file)[0]
                file_map[name_no_ext] = os.path.join(root, file)
    return file_map

def evaluate_model():
    # 1. Load Model
    if not os.path.exists(MODEL_FILE):
        print("Error: Model file not found. Run train.py first.")
        return
    
    print(f"Loading model: {MODEL_FILE}...")
    model = tf.keras.models.load_model(MODEL_FILE)

    # 2. Load Labels and File Map
    if not os.path.exists(PROTOCOL_FILE):
        print(f"Error: Protocol file not found at {PROTOCOL_FILE}")
        print("Check if 'cm_develop.trn' exists or change it to 'cm_evaluation.trn'")
        return

    labels = load_labels(PROTOCOL_FILE)
    file_map = find_all_wav_files(DATASET_FOLDER)

    print(f"Found {len(labels)} labels in protocol.")
    print(f"Found {len(file_map)} wav files on disk.")

    # 3. Predict
    predictions = []
    actuals = []
    
    print("\nStarting Evaluation...")
    # Process files listed in the protocol
    count = 0
    total = len(labels)
    
    for file_name, label in labels.items():
        if file_name in file_map:
            try:
                file_path = file_map[file_name]
                
                # Preprocess
                spec = audio_to_spectrogram(file_path)
                spec = spec[np.newaxis, ...] # Add batch dimension: (1, 128, 400, 3)
                
                # Predict
                pred_prob = model.predict(spec, verbose=0)[0][0]
                pred_label = 1 if pred_prob > 0.5 else 0
                
                predictions.append(pred_label)
                actuals.append(label)
                
                count += 1
                if count % 100 == 0:
                    print(f"Processed {count}/{total} files...", end='\r')
                    
            except Exception as e:
                print(f"Error reading {file_name}: {e}")
        else:
            # This is common if you only downloaded the training set but protocol lists others
            # print(f"Missing file: {file_name}") 
            pass

    if len(predictions) == 0:
        print("No predictions made. Check if your protocol filenames match your audio files.")
        return

    # 4. Metrics
    print("\n\n" + "="*30)
    print("FINAL RESULTS")
    print("="*30)
    print(f"Accuracy:  {accuracy_score(actuals, predictions):.4f}")
    print(f"Precision: {precision_score(actuals, predictions):.4f}")
    print(f"Recall:    {recall_score(actuals, predictions):.4f}")
    print(f"F1 Score:  {f1_score(actuals, predictions):.4f}")
    print("="*30)

if __name__ == "__main__":
    evaluate_model()

'''

In [3]:
# (with Batching)

import os
import numpy as np
import tensorflow as tf
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# --- CONFIGURATION ---
# Root of the local ASVspoof 2015 copy (machine-specific absolute path).
BASE_PATH = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853"
DATASET_FOLDER = os.path.join(BASE_PATH, "wav")  # walked recursively for .wav files

# Use 'cm_evaluation.ndx' for the final test, or 'cm_develop.ndx' for dev
PROTOCOL_FILE = os.path.join(BASE_PATH, "CM_protocol", "cm_develop.ndx") 

MODEL_FILE  = "deepfake_detector.h5"  # model file loaded by evaluate_model()
FIXED_WIDTH = 400  # spectrogram frames per clip (pad/truncate target)
BATCH_SIZE  = 256  # Number of spectrograms passed to model.predict() per call

def audio_to_spectrogram(file_path):
    """Load one audio file and return a 3-channel log-mel spectrogram.

    The audio is read at its native sampling rate, converted to a 128-band
    mel spectrogram in dB (``ref=np.max``), padded or truncated to
    FIXED_WIDTH frames, and its single channel is repeated three times.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        Array of shape (128, FIXED_WIDTH, 3), or None when anything in the
        pipeline raises (the error is printed).
    """
    try:
        waveform, sample_rate = librosa.load(file_path, sr=None)
        mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)
        mel_db = librosa.power_to_db(mel_power, ref=np.max)

        # Positive shortfall -> clip is too short and needs right-padding.
        shortfall = FIXED_WIDTH - mel_db.shape[1]
        if shortfall > 0:
            mel_db = np.pad(mel_db, ((0, 0), (0, shortfall)), mode='constant')
        else:
            mel_db = mel_db[:, :FIXED_WIDTH]

        # (128, W) -> (128, W, 1) -> (128, W, 3)
        return np.repeat(mel_db[..., np.newaxis], 3, axis=-1)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

def load_labels(protocol_path):
    """Read a protocol file and return ``{file_name: label}``.

    Lines are split on whitespace and ignored when they have fewer than four
    fields. Field 2 is the file name; field 4 is the key, mapped to 1 for
    "human" and 0 for anything else.

    Args:
        protocol_path: Path of the protocol text file.

    Returns:
        dict from file name to integer label (1 or 0).
    """
    print(f"Loading labels from: {protocol_path}")
    with open(protocol_path, "r") as protocol:
        rows = [raw.strip().split() for raw in protocol]
    # A later line with the same file name overrides an earlier one.
    return {row[1]: int(row[3] == "human") for row in rows if len(row) >= 4}

def find_all_wav_files(root_folder):
    """Index every ``.wav`` file below ``root_folder`` by its base name.

    Args:
        root_folder: Directory that is walked recursively.

    Returns:
        dict mapping the file name without extension to the file's full
        path. If the same name occurs in several folders, the one visited
        last wins.
    """
    print(f"Indexing audio files in {root_folder}...")
    path_by_stem = {}
    for folder, _subdirs, names in os.walk(root_folder):
        path_by_stem.update(
            (os.path.splitext(name)[0], os.path.join(folder, name))
            for name in names
            if name.endswith(".wav")
        )
    return path_by_stem

def evaluate_model():
    """Score the saved model on every protocol entry that exists on disk.

    Loads MODEL_FILE, reads labels from PROTOCOL_FILE and indexes the audio
    under DATASET_FOLDER. Spectrograms are accumulated into batches of
    BATCH_SIZE and sent through ``model.predict``; scores above 0.5 become
    label 1, everything else label 0. Accuracy, precision, recall and F1 are
    printed at the end. Protocol entries without a matching file, and files
    that fail to convert, are skipped.

    Returns:
        None. All results are printed.
    """
    if not os.path.exists(MODEL_FILE):
        print("Error: deepfake_detector.h5 not found.")
        return
    if not os.path.exists(PROTOCOL_FILE):
        # Fail with a readable message instead of a traceback from open().
        print(f"Error: Protocol file not found at {PROTOCOL_FILE}")
        return

    print(f"Loading model: {MODEL_FILE}...")
    model = tf.keras.models.load_model(MODEL_FILE)

    labels = load_labels(PROTOCOL_FILE)
    file_map = find_all_wav_files(DATASET_FOLDER)

    print(f"Protocol has {len(labels)} files.")
    print(f"Found {len(file_map)} .wav files on disk.")

    predictions = []
    actuals = []

    # Report the batch size actually in use rather than a hard-coded number.
    print(f"\nStarting Fast Evaluation (Batch Size: {BATCH_SIZE})...")
    start_time = time.time()

    # Batch containers
    batch_images = []
    batch_labels = []

    total_files = len(labels)
    processed_count = 0
    batches_done = 0
    progress_every = 10  # print a status line every this many full batches

    def flush_batch():
        """Predict the pending batch, record the results and empty it."""
        preds = model.predict(np.array(batch_images), verbose=0)
        predictions.extend(1 if p[0] > 0.5 else 0 for p in preds)
        actuals.extend(batch_labels)
        batch_images.clear()
        batch_labels.clear()

    for file_name, label in labels.items():
        file_path = file_map.get(file_name)
        if file_path is None:
            continue  # listed in the protocol but not found on disk

        spec = audio_to_spectrogram(file_path)
        if spec is None:
            continue  # conversion failed; already reported by the helper

        batch_images.append(spec)
        batch_labels.append(label)

        # When the batch is full, predict it in one call.
        if len(batch_images) == BATCH_SIZE:
            flush_batch()
            processed_count += BATCH_SIZE
            batches_done += 1

            if batches_done % progress_every == 0:
                elapsed = time.time() - start_time
                rate = processed_count / elapsed if elapsed > 0 else 0.0
                print(f"Processed {processed_count}/{total_files} ({rate:.1f} files/sec)...", end='\r')

    # Process remaining files in the final partial batch
    if batch_images:
        flush_batch()

    # Metrics (label 1 = "human" is the positive class)
    print("\n" + "="*30)
    if len(predictions) > 0:
        print("FINAL RESULTS")
        print("="*30)
        print(f"Accuracy:  {accuracy_score(actuals, predictions):.4f}")
        print(f"Precision: {precision_score(actuals, predictions):.4f}")
        print(f"Recall:    {recall_score(actuals, predictions):.4f}")
        print(f"F1 Score:  {f1_score(actuals, predictions):.4f}")
    else:
        print("No predictions made.")
    print("="*30)

if __name__ == "__main__":
    evaluate_model()

Loading model: deepfake_detector.h5...
Loading labels from: c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_develop.ndx
Indexing audio files in c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav...
Protocol has 53372 files.
Found 291931 .wav files on disk.

Starting Fast Evaluation (Batch Size: 64)...
Processed 52480/53372 (21.7 files/sec)...
FINAL RESULTS
Accuracy:  0.8812
Precision: 0.2196
Recall:    0.3186
F1 Score:  0.2600


In [5]:
import os
import numpy as np
import tensorflow as tf
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve
from scipy.interpolate import interp1d
from scipy.optimize import brentq
import time

# --- CONFIGURATION ---
# Root of the local ASVspoof 2015 copy (machine-specific absolute path).
BASE_PATH = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853"
DATASET_FOLDER = os.path.join(BASE_PATH, "wav")  # walked recursively for .wav files

# This cell scores the development protocol.
PROTOCOL_FILE = os.path.join(BASE_PATH, "CM_protocol", "cm_develop.ndx")

MODEL_FILE  = "deepfake_detector.h5"  # model file loaded by evaluate_model()
FIXED_WIDTH = 400  # spectrogram frames per clip (pad/truncate target)
BATCH_SIZE  = 256  # spectrograms per model.predict() call; author's note: tuned for a 16GB RAM machine

def compute_eer(y_true, y_score):
    """Compute the Equal Error Rate (EER) from labels and scores.

    The EER is the operating point where the false positive rate equals the
    false rejection rate (1 - TPR). It is located by interpolating the ROC
    curve linearly and solving ``1 - x - tpr(x) = 0`` for ``x`` in [0, 1].

    Args:
        y_true: Ground-truth labels, with 1 as the positive class.
        y_score: Scores for the positive class, one per label.

    Returns:
        The EER as a fraction between 0 and 1.

    Raises:
        ValueError: If ``y_true`` does not contain both classes, in which
            case the ROC curve (and hence the EER) is undefined.
    """
    if len(np.unique(y_true)) < 2:
        raise ValueError("EER is undefined: y_true must contain both classes.")

    # roc_curve returns: false positive rate, true positive rate, thresholds
    fpr, tpr, _thresholds = roc_curve(y_true, y_score, pos_label=1)

    # Build the interpolator once; the root finder calls it many times.
    roc = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - roc(x), 0., 1.)
    return eer

def audio_to_spectrogram(file_path):
    """Load one audio file and return a 3-channel log-mel spectrogram.

    The audio is read at its native sampling rate, converted to a 128-band
    mel spectrogram in dB (``ref=np.max``), padded or truncated to
    FIXED_WIDTH frames, and its single channel is repeated three times.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        Array of shape (128, FIXED_WIDTH, 3), or None when the file cannot
        be loaded or converted. The failure is printed so that skipped files
        are visible in the output.
    """
    try:
        y, sr = librosa.load(file_path, sr=None)
        spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        spec_db = librosa.power_to_db(spec, ref=np.max)

        # Force the time axis to exactly FIXED_WIDTH frames.
        if spec_db.shape[1] < FIXED_WIDTH:
            pad_width = FIXED_WIDTH - spec_db.shape[1]
            spec_db = np.pad(spec_db, ((0, 0), (0, pad_width)), mode='constant')
        else:
            spec_db = spec_db[:, :FIXED_WIDTH]

        # (128, W) -> (128, W, 1) -> (128, W, 3)
        spec_db = spec_db[..., np.newaxis]
        spec_db = np.repeat(spec_db, 3, axis=-1)
        return spec_db
    except Exception as e:
        # Still best effort, but report the failure instead of hiding it.
        print(f"Error loading {file_path}: {e}")
        return None

def load_labels(protocol_path):
    """Read a protocol file and return ``{file_name: label}``.

    Lines are split on whitespace and skipped when they have fewer than four
    fields. The second field is the file name; the fourth is the key, mapped
    to 1 for "human" and 0 for anything else.

    Args:
        protocol_path: Path of the protocol text file.

    Returns:
        dict from file name to integer label (1 or 0).
    """
    print(f"Loading labels from: {protocol_path}")

    label_by_name = {}
    with open(protocol_path, "r") as protocol:
        for raw_line in protocol:
            fields = raw_line.strip().split()
            if len(fields) < 4:
                continue
            label_by_name[fields[1]] = int(fields[3] == "human")
    return label_by_name

def find_all_wav_files(root_folder):
    """Index every ``.wav`` file below ``root_folder`` by its base name.

    Args:
        root_folder: Directory that is walked recursively.

    Returns:
        dict mapping the file name without extension to the file's full
        path. If the same name occurs in several folders, the one visited
        last wins.
    """
    print(f"Indexing audio files in {root_folder}...")
    wav_entries = (
        (folder, name)
        for folder, _subdirs, names in os.walk(root_folder)
        for name in names
        if name.endswith(".wav")
    )
    return {
        os.path.splitext(name)[0]: os.path.join(folder, name)
        for folder, name in wav_entries
    }

def evaluate_model():
    """Score the saved model on the configured protocol and print metrics.

    Loads MODEL_FILE, reads labels from PROTOCOL_FILE and indexes the audio
    under DATASET_FOLDER. Spectrograms are accumulated into batches of
    BATCH_SIZE and sent through ``model.predict``. The raw scores are kept
    for the EER; scores above 0.5 become label 1, everything else label 0,
    for accuracy, precision, recall and F1. Protocol entries without a
    matching file, and files that fail to convert, are skipped.

    Returns:
        None. All results are printed.
    """
    if not os.path.exists(MODEL_FILE):
        print("Error: deepfake_detector.h5 not found.")
        return
    if not os.path.exists(PROTOCOL_FILE):
        # Fail with a readable message instead of a traceback from open().
        print(f"Error: Protocol file not found at {PROTOCOL_FILE}")
        return

    print(f"Loading model: {MODEL_FILE}...")
    model = tf.keras.models.load_model(MODEL_FILE)

    labels = load_labels(PROTOCOL_FILE)
    file_map = find_all_wav_files(DATASET_FOLDER)

    print(f"Protocol has {len(labels)} files.")
    print(f"Found {len(file_map)} .wav files on disk.")

    print(f"\nStarting Evaluation on {os.path.basename(PROTOCOL_FILE)}...")
    start_time = time.time()

    # Batch Storage
    batch_images = []
    batch_labels = []

    # Results Storage
    predictions = []       # Binary (0 or 1)
    prediction_scores = [] # Raw model output, used for the EER
    actuals = []           # Ground Truth

    total_files = len(labels)
    processed_count = 0

    def flush_batch():
        """Predict the pending batch, record scores/labels and empty it."""
        preds = model.predict(np.array(batch_images), verbose=0)
        for p in preds:
            prob = p[0]
            prediction_scores.append(prob)
            predictions.append(1 if prob > 0.5 else 0)
        actuals.extend(batch_labels)
        batch_images.clear()
        batch_labels.clear()

    for file_name, label in labels.items():
        file_path = file_map.get(file_name)
        if file_path is None:
            continue  # listed in the protocol but not found on disk

        spec = audio_to_spectrogram(file_path)
        if spec is None:
            continue  # conversion failed

        batch_images.append(spec)
        batch_labels.append(label)

        # When the batch is full, predict it in one call.
        if len(batch_images) == BATCH_SIZE:
            flush_batch()
            processed_count += BATCH_SIZE

            # Status line every 5 full batches.
            if processed_count % (BATCH_SIZE * 5) == 0:
                elapsed = time.time() - start_time
                rate = processed_count / elapsed if elapsed > 0 else 0.0
                print(f"Processed {processed_count}/{total_files} ({rate:.1f} files/sec)...", end='\r')

    # Process Final Batch
    if batch_images:
        flush_batch()

    # --- METRICS & EER ---
    print("\n" + "="*30)
    if len(predictions) > 0:
        print("FINAL RESULTS")
        print("="*30)

        # The EER can fail (e.g. only one class present); report and go on.
        try:
            eer = compute_eer(actuals, prediction_scores)
            print(f"EER:       {eer * 100:.2f}%  <--- LOWER IS BETTER")
        except Exception as e:
            print(f"EER Error: {e}")

        # Label 1 = "human" is the positive class for these metrics.
        print(f"Accuracy:  {accuracy_score(actuals, predictions):.4f}")
        print(f"Precision: {precision_score(actuals, predictions):.4f}")
        print(f"Recall:    {recall_score(actuals, predictions):.4f}")
        print(f"F1 Score:  {f1_score(actuals, predictions):.4f}")
    else:
        print("No predictions made.")
    print("="*30)

if __name__ == "__main__":
    evaluate_model()

Loading model: deepfake_detector.h5...
Loading labels from: c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_develop.ndx
Indexing audio files in c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav...
Protocol has 53372 files.
Found 291931 .wav files on disk.

Starting Evaluation on cm_develop.ndx...
Processed 52480/53372 (19.0 files/sec)...
FINAL RESULTS
EER:       26.79%  <--- LOWER IS BETTER
Accuracy:  0.8812
Precision: 0.2196
Recall:    0.3186
F1 Score:  0.2600


4. Evaluate on Evaluation data

In [4]:
# (with Batching)

import os
import numpy as np
import tensorflow as tf
import librosa
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve
from scipy.interpolate import interp1d
from scipy.optimize import brentq
import time

# --- CONFIGURATION ---
# Root of the local ASVspoof 2015 copy (machine-specific absolute path).
BASE_PATH = r"c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853"
DATASET_FOLDER = os.path.join(BASE_PATH, "wav")  # walked recursively for .wav files

# Protocol selection:
# Use 'cm_develop.ndx' to check EER on the development set
# Use 'cm_evaluation.ndx' for the final evaluation set (selected here)
PROTOCOL_FILE = os.path.join(BASE_PATH, "CM_protocol", "cm_evaluation.ndx")

MODEL_FILE  = "deepfake_detector.h5"  # model file loaded by evaluate_model()
FIXED_WIDTH = 400  # spectrogram frames per clip (pad/truncate target)
BATCH_SIZE  = 256  # spectrograms per model.predict() call; kept high for speed

def compute_eer(y_true, y_score):
    """Compute the Equal Error Rate (EER) from labels and scores.

    The ROC curve is interpolated linearly and the EER is taken as the
    false positive rate ``x`` in [0, 1] that solves ``1 - x - tpr(x) = 0``,
    i.e. where the false positive rate equals 1 - TPR.

    Args:
        y_true: Ground-truth labels, with 1 as the positive class.
        y_score: Scores for the positive class, one per label.

    Returns:
        The EER as a fraction between 0 and 1.

    Raises:
        ValueError: If ``y_true`` does not contain both classes, in which
            case the ROC curve (and hence the EER) is undefined.
    """
    if len(np.unique(y_true)) < 2:
        raise ValueError("EER is undefined: y_true must contain both classes.")

    fpr, tpr, _thresholds = roc_curve(y_true, y_score, pos_label=1)

    # Build the interpolator once; the root finder calls it many times.
    roc = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - roc(x), 0., 1.)
    return eer

def audio_to_spectrogram(file_path):
    """Load one audio file and return a 3-channel log-mel spectrogram.

    The audio is read at its native sampling rate, converted to a 128-band
    mel spectrogram in dB (``ref=np.max``), padded or truncated to
    FIXED_WIDTH frames, and its single channel is repeated three times.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        Array of shape (128, FIXED_WIDTH, 3), or None when the file cannot
        be loaded or converted. The failure is printed so that skipped files
        are visible in the output.
    """
    try:
        y, sr = librosa.load(file_path, sr=None)
        spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        spec_db = librosa.power_to_db(spec, ref=np.max)

        # Force the time axis to exactly FIXED_WIDTH frames.
        if spec_db.shape[1] < FIXED_WIDTH:
            pad_width = FIXED_WIDTH - spec_db.shape[1]
            spec_db = np.pad(spec_db, ((0, 0), (0, pad_width)), mode='constant')
        else:
            spec_db = spec_db[:, :FIXED_WIDTH]

        # (128, W) -> (128, W, 1) -> (128, W, 3)
        spec_db = spec_db[..., np.newaxis]
        spec_db = np.repeat(spec_db, 3, axis=-1)
        return spec_db
    except Exception as e:
        # Still best effort, but report the failure instead of hiding it.
        print(f"Error loading {file_path}: {e}")
        return None

def load_labels(protocol_path):
    """Read a protocol file and return ``{file_name: label}``.

    Each line is split on whitespace; lines with fewer than four fields are
    ignored. The second field names the file and the fourth is the key:
    "human" maps to 1, any other key to 0.

    Args:
        protocol_path: Path of the protocol text file.

    Returns:
        dict from file name to integer label (1 or 0).
    """
    print(f"Loading labels from: {protocol_path}")

    label_by_name = {}
    with open(protocol_path, "r") as protocol:
        split_lines = (raw.strip().split() for raw in protocol)
        for fields in split_lines:
            if len(fields) >= 4:
                is_human = fields[3] == "human"
                label_by_name[fields[1]] = 1 if is_human else 0
    return label_by_name

def find_all_wav_files(root_folder):
    """Index every ``.wav`` file below ``root_folder`` by its base name.

    Args:
        root_folder: Directory that is walked recursively.

    Returns:
        dict mapping the file name without extension to the file's full
        path. If the same name occurs in several folders, the one visited
        last wins.
    """
    print(f"Indexing audio files in {root_folder}...")
    path_by_stem = {}
    for folder, _subdirs, names in os.walk(root_folder):
        wav_names = [name for name in names if name.endswith(".wav")]
        for wav_name in wav_names:
            stem = os.path.splitext(wav_name)[0]
            path_by_stem[stem] = os.path.join(folder, wav_name)
    return path_by_stem

def evaluate_model():
    """Score the saved model on the configured protocol and print metrics.

    Loads MODEL_FILE, reads labels from PROTOCOL_FILE and indexes the audio
    under DATASET_FOLDER. Spectrograms are accumulated into batches of
    BATCH_SIZE and sent through ``model.predict``. The raw scores are kept
    for the EER; scores above 0.5 become label 1, everything else label 0,
    for accuracy, precision, recall and F1. Protocol entries without a
    matching file, and files that fail to convert, are skipped.

    Returns:
        None. All results are printed.
    """
    if not os.path.exists(MODEL_FILE):
        print("Error: deepfake_detector.h5 not found.")
        return
    if not os.path.exists(PROTOCOL_FILE):
        # Fail with a readable message instead of a traceback from open().
        print(f"Error: Protocol file not found at {PROTOCOL_FILE}")
        return

    print(f"Loading model: {MODEL_FILE}...")
    model = tf.keras.models.load_model(MODEL_FILE)

    labels = load_labels(PROTOCOL_FILE)
    file_map = find_all_wav_files(DATASET_FOLDER)

    print(f"Protocol has {len(labels)} files.")
    print(f"Found {len(file_map)} .wav files on disk.")

    print(f"\nStarting Evaluation on {os.path.basename(PROTOCOL_FILE)}...")
    start_time = time.time()

    # Storage
    batch_images = []
    batch_labels = []

    predictions = []       # Binary 0/1
    prediction_scores = [] # Raw model output, used for the EER
    actuals = []           # Ground truth

    total_files = len(labels)
    processed_count = 0

    def flush_batch():
        """Predict the pending batch, record scores/labels and empty it."""
        preds = model.predict(np.array(batch_images), verbose=0)
        for p in preds:
            prob = p[0]
            prediction_scores.append(prob)
            predictions.append(1 if prob > 0.5 else 0)
        actuals.extend(batch_labels)
        batch_images.clear()
        batch_labels.clear()

    for file_name, label in labels.items():
        file_path = file_map.get(file_name)
        if file_path is None:
            continue  # listed in the protocol but not found on disk

        spec = audio_to_spectrogram(file_path)
        if spec is None:
            continue  # conversion failed

        batch_images.append(spec)
        batch_labels.append(label)

        # Process Batch
        if len(batch_images) == BATCH_SIZE:
            flush_batch()
            processed_count += BATCH_SIZE

            # Status line every 10 full batches.
            if processed_count % (BATCH_SIZE * 10) == 0:
                elapsed = time.time() - start_time
                rate = processed_count / elapsed if elapsed > 0 else 0.0
                print(f"Processed {processed_count}/{total_files} ({rate:.1f} files/sec)...", end='\r')

    # Process Final Batch
    if batch_images:
        flush_batch()

    # Metrics
    print("\n" + "="*30)
    if len(predictions) > 0:
        # None marks "EER unavailable" so that a failure is never printed
        # as if it were a real percentage.
        eer_percent = None
        try:
            eer_percent = compute_eer(actuals, prediction_scores) * 100
        except Exception as e:
            print(f"Could not compute EER: {e}")

        print("FINAL RESULTS")
        print("="*30)
        # Label 1 = "human" is the positive class for these metrics.
        print(f"Accuracy:  {accuracy_score(actuals, predictions):.4f}")
        print(f"Precision: {precision_score(actuals, predictions):.4f}")
        print(f"Recall:    {recall_score(actuals, predictions):.4f}")
        print(f"F1 Score:  {f1_score(actuals, predictions):.4f}")
        print("-" * 30)
        if eer_percent is None:
            print("EER:       n/a")
        else:
            print(f"EER:       {eer_percent:.2f}%")
    else:
        print("No predictions made.")
    print("="*30)

if __name__ == "__main__":
    evaluate_model()

Loading model: deepfake_detector.h5...
Loading labels from: c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\CM_protocol\cm_evaluation.ndx
Indexing audio files in c:\Users\Bharat\Audio-Deepfake-Detection-using-Pretrained-Model\ASVspoof 2015 DS_10283_853\wav...
Protocol has 193404 files.
Found 291931 .wav files on disk.

Starting Evaluation on cm_evaluation.ndx...
Processed 192000/193404 (19.8 files/sec)...
FINAL RESULTS
Accuracy:  0.8581
Precision: 0.1425
Recall:    0.3823
F1 Score:  0.2076
------------------------------
EER:       29.63%
