In [1]:
# Core packages
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models, utils

In [2]:
# File paths and configuration
# AUDIO_DIR holds the training clips; each one is opened as AUDIO_DIR/<file id>.flac.
AUDIO_DIR = "LA/ASVspoof2019_LA_train/flac"
# PROTO_FILE is the whitespace-separated text file the training labels are read from.
PROTO_FILE = "LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"

CLASSES = 2  # width of the one-hot targets and of the softmax output layer
SRATE = 16000  # sample rate passed to librosa.load and melspectrogram
AUDIO_LEN = 5  # seconds
MEL_BANDS = 128  # n_mels passed to melspectrogram; also the model's input height

In [None]:
# Create label dictionary from protocol file
# Maps the id in the second whitespace-separated column of each line to a binary
# target: 1 when the last column is "bonafide", 0 for anything else.
file_labels = {}

with open(PROTO_FILE, 'r') as proto:
    for line in proto:
        parts = line.split()
        # Skip blank or truncated lines (for example a trailing empty line)
        # instead of failing with IndexError on parts[1].
        if len(parts) < 2:
            continue
        file_labels[parts[1]] = 1 if parts[-1] == "bonafide" else 0




In [None]:
# Feature and label preparation
# Builds one fixed-size log-mel spectrogram per labelled utterance, keeping the
# spectrogram list and the target list in the same order.
FIXED_FRAMES = 109  # time frames every spectrogram is padded or cropped to

spectrograms = []
targets = []

for utt_id, utt_label in file_labels.items():
    flac_path = os.path.join(AUDIO_DIR, f"{utt_id}.flac")

    # Load at most AUDIO_LEN seconds, resampled to SRATE.
    waveform, _ = librosa.load(flac_path, sr=SRATE, duration=AUDIO_LEN)
    power_mel = librosa.feature.melspectrogram(y=waveform, sr=SRATE, n_mels=MEL_BANDS)
    log_mel = librosa.power_to_db(power_mel, ref=np.max)

    # Force a uniform width along the time axis: right-pad short clips, crop the rest.
    missing = FIXED_FRAMES - log_mel.shape[1]
    if missing > 0:
        # NOTE(review): the padding value is np.pad's default constant (0). With
        # ref=np.max the loudest bin sits at 0 dB, so the padding may read as peak
        # energy rather than silence - confirm this is intended. Any change must
        # also be applied to the test-time preprocessing.
        log_mel = np.pad(log_mel, ((0, 0), (0, missing)), mode='constant')
    else:
        log_mel = log_mel[:, :FIXED_FRAMES]

    spectrograms.append(log_mel)
    targets.append(utt_label)

In [None]:
# Convert to numpy arrays
# Stack the per-utterance spectrograms and their labels into arrays.
X_data = np.asarray(spectrograms)
y_data = np.asarray(targets)

# One-hot encode target classes (one column per class)
y_encoded = utils.to_categorical(y_data, CLASSES)

In [None]:
# Shuffled 80-20 training-validation split
# X_data / y_encoded are in protocol-file order. A plain head/tail cut would make
# the validation set whatever happens to come last in that file.
# NOTE(review): the protocol looks to be grouped by label/attack - confirm.
# Shuffle the indices with a fixed seed first so the split is representative
# and reproducible.
SPLIT_SEED = 42
shuffled_idx = np.random.default_rng(SPLIT_SEED).permutation(len(X_data))

cutoff = int(0.8 * len(X_data))
train_idx, valid_idx = shuffled_idx[:cutoff], shuffled_idx[cutoff:]

# Index-array selection copies the data, so X_data / y_encoded stay untouched
# and this cell can be re-run safely.
X_train, X_valid = X_data[train_idx], X_data[valid_idx]
y_train, y_valid = y_encoded[train_idx], y_encoded[valid_idx]

In [None]:
# CNN Architecture for Spectrogram Classification
# Input is a single-channel image: MEL_BANDS rows by however many time frames
# the training array has.
input_dim = (MEL_BANDS, X_train.shape[2], 1)
inputs = layers.Input(shape=input_dim)

# Two conv + max-pool stages, with 32 then 64 filters.
net = inputs
for n_filters in (32, 64):
    net = layers.Conv2D(n_filters, (3, 3), activation='relu')(net)
    net = layers.MaxPooling2D(pool_size=(2, 2))(net)

# Dense classification head with dropout before the softmax output.
net = layers.Flatten()(net)
net = layers.Dense(128, activation='relu')(net)
net = layers.Dropout(0.5)(net)
outputs = layers.Dense(CLASSES, activation='softmax')(net)

model = models.Model(inputs, outputs)

In [None]:
# Compile and Train the Model
# Adam + categorical cross-entropy to match the one-hot targets; accuracy is tracked.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs in batches of 32, validating on the held-out split each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

# Save the trained network so the evaluation cells can reload it from disk.
model.save("audio_cnn_model.h5")

In [None]:
# Load model and prepare for evaluation
from tensorflow.keras.models import load_model

# Folder with the evaluation clips, and the file the trained network was saved to.
TEST_AUDIO_DIR = "./TestEvaluation"
LOADED_MODEL_PATH = "audio_cnn_model.h5"

# Rebind `model` to the copy restored from disk.
model = load_model(filepath=LOADED_MODEL_PATH)

In [None]:
# Process test set into mel spectrograms
# os.listdir() gives no ordering guarantee, so sort the names: the row order of
# X_test (and of every prediction derived from it) is then reproducible.
test_audio_files = sorted(f for f in os.listdir(TEST_AUDIO_DIR) if f.endswith(".flac"))
if not test_audio_files:
    # Fail here with a clear message rather than later inside model.predict.
    raise FileNotFoundError(f"No .flac files found in {TEST_AUDIO_DIR!r}")

test_inputs = []

for fname in test_audio_files:
    fpath = os.path.join(TEST_AUDIO_DIR, fname)
    audio, _ = librosa.load(fpath, sr=SRATE, duration=AUDIO_LEN)

    # Same log-mel parameters as the training features.
    mel = librosa.feature.melspectrogram(y=audio, sr=SRATE, n_mels=MEL_BANDS)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Pad or truncate to FIXED_FRAMES so every sample matches the model input width.
    # Padding is deliberately left identical to the training preprocessing.
    if mel_db.shape[1] < FIXED_FRAMES:
        mel_db = np.pad(mel_db, ((0, 0), (0, FIXED_FRAMES - mel_db.shape[1])), mode='constant')
    else:
        mel_db = mel_db[:, :FIXED_FRAMES]

    test_inputs.append(mel_db)

# Row i of X_test corresponds to test_audio_files[i].
X_test = np.array(test_inputs)

In [None]:
# Predict class probabilities
# One row per test clip, one column per class.
test_probs = model.predict(X_test)

# Get predicted class indices (column with the highest probability in each row)
predicted_labels = test_probs.argmax(axis=1)

In [None]:
# Read reference labels for evaluation
# Each usable line contributes first column -> last column; lines with fewer
# than two whitespace-separated fields are ignored.
label_protocol_path = "test_eval.txt"
true_targets = {}

with open(label_protocol_path, 'r') as label_file:
    for entry in label_file:
        fields = entry.strip().split()
        if len(fields) < 2:
            continue
        true_targets[fields[0]] = fields[-1]

In [None]:
# Convert textual labels to numeric format, aligned with the prediction order
# predicted_labels / test_probs follow test_audio_files (directory listing order),
# while true_targets follows the label file. Look each test file up by name so
# that true_numeric[i] is the reference label for prediction i.
def _reference_label(fname):
    """Return the textual label recorded in true_targets for one test file name.

    The label file may key entries with or without the ".flac" extension, so the
    bare stem is tried first and the full file name second.

    Raises:
        KeyError: if the file has no entry in the label file.
    """
    stem = os.path.splitext(fname)[0]
    for key in (stem, fname):
        if key in true_targets:
            return true_targets[key]
    raise KeyError(f"No reference label for test file {fname!r} in {label_protocol_path!r}")


# 1 = "bonafide", 0 = anything else (same encoding as the training targets).
true_numeric = np.array(
    [1 if _reference_label(fname) == "bonafide" else 0 for fname in test_audio_files]
)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Index i of labels_list names class i (0 = spoof, 1 = bonafide).
labels_list = ["spoof", "bonafide"]

# Pin labels=[0, 1] so the matrix is always 2x2 and matches labels_list, even if
# one class never occurs in the reference labels or the predictions.
conf_mat = confusion_matrix(true_numeric, predicted_labels, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=labels_list)
disp.plot(cmap="Blues", ax=ax)
ax.set_title("Prediction Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Column 1 of the softmax output is the score for class 1 ('bonafide').
pos_probs = test_probs[:, 1]  # Probabilities for 'bonafide'

# ROC points over all thresholds, then the area under that curve.
fpr, tpr, _ = roc_curve(true_numeric, pos_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC AUC = {roc_auc:.2f}")
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle='--', color='navy')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision/recall pairs over all thresholds, plus the average-precision summary.
precision, recall, _ = precision_recall_curve(true_numeric, pos_probs)
avg_prec = average_precision_score(true_numeric, pos_probs)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label=f'AP = {avg_prec:.2f}', color='darkgreen')
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend(loc='lower left')
ax.grid(True)
plt.show()

In [None]:
from sklearn.calibration import calibration_curve

# Bin the predicted class-1 probabilities into 10 bins and compare each bin's
# mean prediction with the observed fraction of positives.
prob_true, prob_pred = calibration_curve(true_numeric, pos_probs, n_bins=10)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(prob_pred, prob_true, marker='o', label='Model Calibration', color='purple')
# The y = x diagonal is drawn as the 'Ideal' reference.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Ideal')
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('True Fraction')
ax.set_title('Calibration Plot')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import seaborn as sns

# Bar chart of how many reference labels fall in each class.
fig, ax = plt.subplots(figsize=(6, 4))

# order=[0, 1] pins class 0 to tick 0 and class 1 to tick 1. Without it, a label
# set containing a single class would put that class at position 0 and the
# hard-coded tick names below could be wrong.
sns.countplot(x=true_numeric, order=[0, 1], palette='pastel', ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(["spoof", "bonafide"])
ax.set_title("True Label Distribution")
ax.set_ylabel("Samples")
ax.set_xlabel("Class")
ax.grid(True)
plt.show()

In [None]:
import librosa.display

# Visual sanity check: draw the log-mel spectrogram of the first five .flac
# files returned by the directory listing.
flac_names = [f for f in os.listdir(TEST_AUDIO_DIR) if f.endswith(".flac")]
sample_files = flac_names[:5]

for audio_file in sample_files:
    audio_path = os.path.join(TEST_AUDIO_DIR, audio_file)
    audio, _ = librosa.load(audio_path, sr=SRATE, duration=AUDIO_LEN)

    # Same mel / dB conversion as the model features, but without padding or cropping.
    mel = librosa.feature.melspectrogram(y=audio, sr=SRATE, n_mels=MEL_BANDS)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # One figure per clip, with a dB colour bar.
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, x_axis='time', y_axis='mel', sr=SRATE)
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"Mel Spectrogram - {audio_file}")
    plt.tight_layout()
    plt.show()

In [None]:
!pip install graphviz pydot

In [None]:
# Import plot_model from tensorflow.keras, the same Keras the rest of the
# notebook builds, trains and loads the model with. The standalone `keras`
# package may be missing or a different version.
from tensorflow.keras.utils import plot_model

# Reload the saved network and write its layer graph (with shapes and layer
# names) to network_diagram.png.
visual_model = tf.keras.models.load_model("audio_cnn_model.h5")
plot_model(visual_model, to_file='network_diagram.png', show_shapes=True, show_layer_names=True)