In [1]:
import os
import torchaudio

import numpy as np
import matplotlib.pyplot as plt
import librosa
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn import svm
import torch.optim as optim
from sklearn.metrics import accuracy_score


from sklearn.metrics import roc_curve, auc, roc_auc_score
from scipy.optimize import brentq
from scipy.interpolate import interp1d

# Path to the text file containing labels.
label_file_path = "/Users/zhuohangchen/Downloads/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
# Path to the directory containing audio files.
audio_dir_path = "/Users/zhuohangchen/Downloads/LA/ASVspoof2019_LA_train/flac"

# Dictionary to store audio file labels.
audio_labels = {}

# Read the labels from the text file and store them in the dictionary.
with open(label_file_path, 'r') as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) == 5 and parts[4] in ['bonafide', 'spoof']:
            audio_id = parts[1]
            label = 0 if parts[4] == 'bonafide' else 1
            audio_labels[audio_id] = label

# Dictionary 'audio_labels' where keys are audio IDs.
# and values are labels (0 for genuine, 1 for spoofed).


# List to store audio filenames and corresponding labels.
audio_filenames = []
labels = []

# Iterate through the audio directory to find the audio files.
for filename in os.listdir(audio_dir_path):
    if filename.endswith('.flac'):
        audio_id = filename.split('.')[0]
        if audio_id in audio_labels:
            audio_file_path = os.path.join(audio_dir_path, filename)
            audio_filenames.append(audio_file_path)
            labels.append(audio_labels[audio_id])
class AudioDataset(Dataset):
    def __init__(self, audio_filenames, labels):
        self.audio_filenames = audio_filenames
        self.labels = labels
        self.sr = 16000  # Sample rate for resampling.
        self.max_seq_len = 200  # Maximum sequence length for padding.

    def __len__(self):
        return len(self.audio_filenames)

    def __getitem__(self, idx):
        audio_filename = self.audio_filenames[idx]
        waveform, sample_rate = torchaudio.load(audio_filename)

        # Resample the audio waveform if needed
        if sample_rate != self.sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.sr)
            waveform = resampler(waveform)

        # Compute MFCCs using librosa.
        waveform = waveform.squeeze().numpy()  # Convert tensor to NumPy array.
        mfccs = librosa.feature.mfcc(y=waveform, sr=self.sr, n_mfcc=13)

        # Pad or truncate MFCCs to the maximum sequence length.
        if mfccs.shape[1] < self.max_seq_len:
            mfccs = np.pad(mfccs, ((0, 0), (0, self.max_seq_len - mfccs.shape[1])), mode='constant')
        else:
            mfccs = mfccs[:, :self.max_seq_len]

        # Convert MFCCs to tensor.
        mfccs_tensor = torch.tensor(mfccs)

        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return mfccs_tensor, label

    


# Create dataset and dataloader
dataset = AudioDataset(audio_filenames, labels)
data_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=True)

for mfcc_batch, label_batch in data_loader:
    X = mfcc_batch.view(mfcc_batch.size(0), -1).numpy()
    y = label_batch.numpy()

# Splitting the data into training and validation sets
train_size = int(0.8 * len(X))  # 80% of data for training
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


# Use SVM with Radial basis function (RBF) kernel
clf = svm.SVC(kernel='rbf', C=1)

clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_val = clf.predict(X_val)


train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")



Training Accuracy: 99.26%
Validation Accuracy: 98.42%


In [2]:
def test_svm(clf, test_data_loader):
    # Collect all test data
    X_test = []
    y_test = []
    for mfcc_batch, label_batch in test_data_loader:
        mfcc_batch_np = mfcc_batch.view(mfcc_batch.size(0), -1).numpy()
        label_batch_np = label_batch.numpy()
        X_test.extend(mfcc_batch_np)
        y_test.extend(label_batch_np)
    
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # Get decision scores from the SVM
    decision_scores = clf.decision_function(X_test)
    
    # Make predictions using the SVM
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    correct = np.sum(y_pred == y_test)
    total = len(y_test)
    accuracy = 100 * correct / total

    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_test, decision_scores)
    
    # Compute EER
    fpr, tpr, thresholds = roc_curve(y_test, decision_scores)
    eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    
    print(f"Testing Accuracy: {accuracy:.2f}%")
    print(f"ROC AUC: {roc_auc:.2f}")
    print(f"EER: {eer:.2f}")

test_audio_dir = "/Users/zhuohangchen/Downloads/LA/ASVspoof2019_LA_eval/flac"
test_labels_file = "/Users/zhuohangchen/Downloads/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt"

# Load the testing audio filenames and labels.
test_audio_filenames = []
test_labels = []

with open(test_labels_file, 'r') as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) == 5 and parts[4] in ['bonafide', 'spoof']:
            audio_id = parts[1]
            label = 0 if parts[4] == 'bonafide' else 1
            audio_file_path = os.path.join(test_audio_dir, f"{audio_id}.flac")
            if os.path.exists(audio_file_path):
                test_audio_filenames.append(audio_file_path)
                test_labels.append(label)

# Create the testing dataset and data loader.
test_dataset = AudioDataset(test_audio_filenames, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Test the SVM on the testing data.
test_svm(clf, test_data_loader)


Testing Accuracy: 89.68%
ROC AUC: 0.50
EER: 0.50


In [3]:
def test_svm(clf, test_data_loader):
    # Collect all test data
    X_test = []
    y_test = []
    for mfcc_batch, label_batch in test_data_loader:
        mfcc_batch_np = mfcc_batch.view(mfcc_batch.size(0), -1).numpy()
        label_batch_np = label_batch.numpy()
        X_test.extend(mfcc_batch_np)
        y_test.extend(label_batch_np)
    
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # Make predictions using the SVM
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    correct = np.sum(y_pred == y_test)
    total = len(y_test)
    accuracy = 100 * correct / total
    
    print(f"Testing Accuracy: {accuracy:.2f}%")

    
test_audio_dir = "/Users/zhuohangchen/Downloads/ASVspoof2017_V2_eval"
test_labels_file = "/Users/zhuohangchen/Downloads/protocol_V2/ASVspoof2017_V2_eval.trl.txt"

# Load the testing audio filenames and labels.
test_audio_filenames = []
test_labels = []

with open(test_labels_file, 'r') as file:
    for line in file:
        parts = line.strip().split()
        if len(parts) >= 2 and parts[1] in ['spoof', 'genuine']:
            audio_id = parts[0].split('.')[0]
            label = 0 if parts[1] == 'genuine' else 1
            audio_file_path = os.path.join(test_audio_dir, f"{audio_id}.wav")
            if os.path.exists(audio_file_path):
                test_audio_filenames.append(audio_file_path)
                test_labels.append(label)

# Create the testing dataset and data loader.
test_dataset = AudioDataset(test_audio_filenames, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Test the SVM on the testing data.
test_svm(clf, test_data_loader)


Testing Accuracy: 90.25%
