In [2]:
import os
import torch
import librosa
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
import torch.nn as nn

In [3]:
#GPU device
device = 'cuda' if torch.cuda.is_available() else 'cpu'                  
print('Device: {}'.format(device))

# Load the SSL W2V model trained for LA and DF tracks
from model import Model

model = Model(None, device=device)
model = nn.DataParallel(model).to(device)
model.load_state_dict(torch.load('finetuned_model.pth'))

model.eval()

Device: cuda


2024-05-03 21:52:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


DataParallel(
  (module): Model(
    (ssl_model): SSLModel(
      (model): Wav2Vec2Model(
        (feature_extractor): ConvFeatureExtractionModel(
          (conv_layers): ModuleList(
            (0): Sequential(
              (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU(approximate='none')
            )
            (1-4): 4 x Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU(approximate='none')
            )
     

In [5]:
def pad(x, max_len=64600):
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    # need to pad
    num_repeats = int(max_len / x_len)+1
    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
    return padded_x	

# Define a function to preprocess the audio samples
def preprocess_audio(audio_path):
    # Check file extension
    _, ext = os.path.splitext(audio_path)
    if ext.lower() not in ('.mp3', '.wav'):
        # Skip processing if file extension is not .mp3 or .wav
        return None
    
    # Load the audio file and extract features
    audio, sr = librosa.load(audio_path, sr=None)
    audio = pad(audio)
    # Here you can apply further preprocessing if needed, e.g., feature extraction
    return audio


# Define paths to real and fake audio samples
real_audio_dir = r"Dataset_Speech_Assignment/Real"
fake_audio_dir = r"Dataset_Speech_Assignment/Fake"

# Collect paths to real and fake audio files
real_audio_paths = [os.path.join(real_audio_dir, filename) for filename in os.listdir(real_audio_dir)]
fake_audio_paths = [os.path.join(fake_audio_dir, filename) for filename in os.listdir(fake_audio_dir)]

In [6]:
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np
from tqdm import tqdm

# Collect predictions and ground truth labels
predictions = []
ground_truth = []

# Use tqdm to add a progress bar
for audio_path in tqdm(real_audio_paths + fake_audio_paths):
    try:
        processed_audio = preprocess_audio(audio_path)
        
        # Skip processing if preprocess_audio returns None
        if processed_audio is None:
            continue
        
        processed_audio_tensor = torch.tensor(processed_audio, dtype=torch.float32).unsqueeze(0)
        
        with torch.no_grad():
            output = model(processed_audio_tensor)
            # Move the tensor from CUDA device to CPU
            output_cpu = output.cpu()

            # Extract the second value (index 1) and convert it to a Python scalar
            prediction = output_cpu[0][1].item()
            
        predictions.append(prediction)
        
        # Add ground truth label
        if audio_path in real_audio_paths:
            ground_truth.append(0)  # 0 for real
        else:
            ground_truth.append(1)  # 1 for fake
            
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        continue

# Convert lists to numpy arrays
predictions = np.array(predictions)
ground_truth = np.array(ground_truth)

# Calculate AUC
auc_score = roc_auc_score(ground_truth, predictions)

print("AUC:", auc_score)


  audio, sr = librosa.load(audio_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
 68%|██████▊   | 205/301 [00:07<00:02, 33.50it/s]

Error processing Dataset_Speech_Assignment/Fake/ayla_ja.wav: 


 72%|███████▏  | 218/301 [00:07<00:02, 36.34it/s]

Error processing Dataset_Speech_Assignment/Fake/carla_pt.wav: 
Error processing Dataset_Speech_Assignment/Fake/carla_en.wav: 


 79%|███████▉  | 238/301 [00:08<00:02, 30.23it/s]

Error processing Dataset_Speech_Assignment/Fake/carla_de.wav: 


 82%|████████▏ | 246/301 [00:08<00:01, 31.64it/s]

Error processing Dataset_Speech_Assignment/Fake/ayla_en.wav: 


 86%|████████▌ | 258/301 [00:09<00:01, 31.20it/s]

Error processing Dataset_Speech_Assignment/Fake/ayla_fr.wav: 


100%|██████████| 301/301 [00:10<00:00, 27.60it/s]

AUC: 0.16505847953216374





In [7]:
# Calculate AUC
auc_score = roc_auc_score(ground_truth, predictions)

# Calculate ROC curve
fpr, tpr, _ = roc_curve(ground_truth, predictions)

# Find the point on the ROC curve where FPR equals 1 - TPR
eer = 1.0
for i in range(len(fpr)):
    if fpr[i] >= 1 - tpr[i]:
        eer = fpr[i]
        break

# Analyze the performance of the model
print("AUC:", auc_score)
print("EER:", eer)

AUC: 0.16505847953216374
EER: 0.7333333333333333
