In [2]:
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Both the feature extractor and the encoder are restored from the same
# pretrained checkpoint, so the identifier is spelled out only once.
WAV2VEC_CHECKPOINT = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(WAV2VEC_CHECKPOINT)
model_wav2vec = Wav2Vec2Model.from_pretrained(WAV2VEC_CHECKPOINT)

def extract_mfcc(file_path, n_mfcc=13):
    """Summarise an audio file as one time-averaged MFCC vector.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        The mean of the MFCC matrix over its frame axis, i.e. one value per
        coefficient.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    # Put frames on axis 0, then average that axis away for a fixed-size vector.
    frames_first = coefficients.T
    return np.mean(frames_first, axis=0)

def extract_mel_spectrogram(file_path, n_mels=128):
    """Compute the Mel spectrogram of an audio file on a decibel scale.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mels: Number of Mel bands requested from librosa.

    Returns:
        The Mel power spectrogram converted to dB, using the spectrogram's
        own maximum as the reference level.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    signal, sample_rate = librosa.load(file_path, sr=None)
    power_spec = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=n_mels)
    return librosa.power_to_db(power_spec, ref=np.max)

def extract_wav2vec_large_features(file_path):
    """Embed an audio file with the module-level Wav2Vec model.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        A NumPy array holding the model's last hidden state averaged over
        dim 1, with size-1 dimensions squeezed out.
    """
    target_sr = 16000  # Wav2Vec expects a 16kHz sampling rate
    waveform, _ = librosa.load(file_path, sr=target_sr)
    encoded = feature_extractor(waveform, sampling_rate=target_sr, return_tensors="pt")

    # Inference only: gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        hidden_states = model_wav2vec(encoded.input_values).last_hidden_state

    # Average over dim 1 so the result has a fixed size.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()


In [7]:
import numpy as np
import librosa
from tensorflow.keras.models import load_model
import tensorflow as tf


# Feature dimensions that preprocess_audio_for_prediction reshapes its outputs to.
# NOTE(review): presumably these must equal the input shapes the saved model was trained with — confirm.
num_mfcc_features = 13  # Length of the MFCC vector; same value as extract_mfcc's default n_mfcc
mel_spec_height = 128   # Mel spectrogram rows; same value as extract_mel_spectrogram's default n_mels
mel_spec_width = 63     # Mel spectrogram columns after pad_or_truncate_mel_spec
num_embeddings = 1024   # Length of the pooled Wav2Vec embedding vector

# Function to pad or truncate a Mel spectrogram to a fixed (height, width)
def pad_or_truncate_mel_spec(mel_spec, target_height=128, target_width=63):
    """Force a 2-D spectrogram to exactly ``(target_height, target_width)``.

    Rows/columns beyond the target are dropped from the end; missing
    rows/columns are appended at the end and filled with zeros. When the
    input already has ``target_height`` rows, only the width is adjusted.

    Args:
        mel_spec: 2-D NumPy array, indexed as ``[row, column]``.
        target_height: Number of rows in the result.
        target_width: Number of columns in the result.

    Returns:
        A new array of shape ``(target_height, target_width)``.
    """
    # Truncate first: slicing past the end is a no-op, so this is safe for
    # inputs that are already small enough.
    mel_spec = mel_spec[:target_height, :target_width]

    # After truncation both differences are >= 0.
    missing_rows = target_height - mel_spec.shape[0]
    missing_cols = target_width - mel_spec.shape[1]

    # NOTE(review): the fill value is 0. For a dB spectrogram computed with
    # ref=np.max, 0 dB is the peak level rather than silence. Left unchanged
    # because altering it would change the inputs the model sees — confirm
    # against the training pipeline.
    return np.pad(mel_spec, ((0, missing_rows), (0, missing_cols)), mode='constant')

def preprocess_audio_for_prediction(file_path):
    """Turn one audio file into the three batched arrays fed to the model.

    Args:
        file_path: Path of the audio file; it is passed to each of the three
            feature extractors in turn.

    Returns:
        A tuple ``(mfcc, mel, wav2vec)`` reshaped to
        ``(1, num_mfcc_features)``,
        ``(1, mel_spec_height, mel_spec_width, 1)`` and
        ``(1, num_embeddings)`` respectively.
    """
    mfcc_vec = extract_mfcc(file_path)
    mel_db = extract_mel_spectrogram(file_path)
    wav2vec_vec = extract_wav2vec_large_features(file_path)

    # The spectrogram is brought to a fixed size before it is reshaped.
    mel_fixed = pad_or_truncate_mel_spec(
        mel_db,
        target_height=mel_spec_height,
        target_width=mel_spec_width,
    )

    # Add a leading batch axis everywhere; the spectrogram also gets a
    # trailing channel axis.
    mel_batch = mel_fixed.reshape((1, mel_spec_height, mel_spec_width, 1))
    mfcc_batch = mfcc_vec.reshape((1, num_mfcc_features))
    wav2vec_batch = wav2vec_vec.reshape((1, num_embeddings))

    return mfcc_batch, mel_batch, wav2vec_batch

# Preprocess one sample clip into the three model inputs.
# NOTE(review): absolute, machine-specific Windows path — this cell only runs where that file exists.
sample_audio_file = r"C:\Users\ACER\Desktop\AudioDF\audio\fake\file24.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav"
# Each returned array has a leading batch dimension of 1 (see the reshapes in preprocess_audio_for_prediction).
X_mfcc, X_mel, X_wav2vec = preprocess_audio_for_prediction(sample_audio_file)

#tf.keras.saving.register_keras_serializable()
def focal_loss_fixed(y_true, y_pred, gamma=2.0, alpha=0.25):
    """Binary focal loss, averaged over every element.

    Args:
        y_true: Target tensor, used as a multiplicative weight (``y_true``
            for the positive term, ``1 - y_true`` for the negative term).
        y_pred: Predicted probabilities; clipped to ``[1e-7, 1 - 1e-7]``
            before any logarithm is taken.
        gamma: Exponent of the modulating factor.
        alpha: Weight of the positive term; the negative term is weighted
            by ``1 - alpha``.

    Returns:
        The mean of the element-wise loss as a scalar tensor.
    """
    eps = 1e-7
    # Clipping keeps both log() calls away from log(0).
    p = tf.clip_by_value(y_pred, eps, 1 - eps)

    positive_term = -y_true * alpha * tf.pow(1 - p, gamma) * tf.math.log(p)
    # Written without its leading minus sign; it is subtracted below.
    negative_term = (1 - y_true) * (1 - alpha) * tf.pow(p, gamma) * tf.math.log(1 - p)

    return tf.reduce_mean(positive_term - negative_term)
# Restore the trained classifier; the custom loss is supplied by name
# through `custom_objects`.
model = load_model(
    'audio_deepfake_finetune_model1.h5',
    custom_objects={'focal_loss_fixed': focal_loss_fixed},
)

# Optional one-off migration to the `.keras` format (left disabled):
#   model.save('best_model.keras')
#   model = load_model('best_model.keras', custom_objects={'focal_loss_fixed': focal_loss_fixed})

# Score the sample; inputs are passed in the order MFCC, Mel spectrogram, Wav2Vec.
prediction = model.predict([X_mfcc, X_mel, X_wav2vec])

# A score above 0.5 is reported as fake, anything else as real.
print("Prediction: Fake audio" if prediction[0] > 0.5 else "Prediction: Real audio")




1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 582ms/step
Prediction: Fake audio
