In [6]:
from tensorflow.keras.models import load_model

# Load the previously trained model
model = load_model('audio_deepfake_model.h5')



In [8]:
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

def load_and_preprocess_audio(file_path, target_sr=16000):
    # Load audio with Librosa, resampling to target_sr
    audio, sr = librosa.load(file_path, sr=target_sr)
    
    # Trim leading and trailing silence
    audio, _ = librosa.effects.trim(audio)
    
    # Perform noise reduction (simple method by subtracting mean)
    audio = audio - np.mean(audio)
    
    return audio, sr

# Initialize Wav2Vec processor and model for embeddings
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model_wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
def extract_wav2vec_large_features(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, sr=16000)  # Ensure the sampling rate is 16kHz
    
    # Prepare input
    input_values = processor(y, return_tensors="pt", sampling_rate=16000)

    # Check the structure of input_values
    print("Input values keys:", input_values.keys())  # Debugging line to see what keys are returned

    # Get features
    with torch.no_grad():
        outputs = model(**input_values)  # Use double asterisks to unpack the dictionary
        features = outputs.last_hidden_state.mean(dim=1)  # Average pooling over the sequence length

    return features.squeeze().numpy()  # Return as a numpy array


In [88]:
import librosa

def extract_mfcc(file_path, sr=16000, n_mfcc=13):
    audio, _ = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return mfccs.mean(axis=1)  # Averaging MFCCs over time


In [90]:
def extract_mel_spectrogram(file_path, sr=16000):
    audio, _ = librosa.load(file_path, sr=sr)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)  # Convert to dB
    return mel_spectrogram_db


In [92]:
'''
def augment_audio(audio, sr, pitch_shift_steps=2, time_stretch_rate=1.2):
    # Pitch shifting
    augmented_audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=pitch_shift_steps)
    
    # Time stretching
    augmented_audio = librosa.effects.time_stretch(augmented_audio, rate=time_stretch_rate)
    
    return augmented_audio
'''

'\ndef augment_audio(audio, sr, pitch_shift_steps=2, time_stretch_rate=1.2):\n    # Pitch shifting\n    augmented_audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=pitch_shift_steps)\n    \n    # Time stretching\n    augmented_audio = librosa.effects.time_stretch(augmented_audio, rate=time_stretch_rate)\n    \n    return augmented_audio\n'

In [94]:
def preprocess_and_extract_features(file_path, augment=False):
    mfcc_features = extract_mfcc(file_path)
    mel_spectrogram_features = extract_mel_spectrogram(file_path)
    wav2vec_features = extract_wav2vec_large_features(file_path)  # Pass file_path directly

    return mfcc_features, mel_spectrogram_features, wav2vec_features

In [96]:
import os
import numpy as np
'''
def preprocess_and_extract_features(file_path, augment=False):
    # Load audio
    audio, sr = librosa.load(file_path, sr=16000)
    
    # Optionally augment audio
    if augment:
        audio = augment_audio(audio, sr)  # Ensure this function is defined

    # Extract features
    mfcc_features = extract_mfcc(file_path)
    mel_spectrogram_features = extract_mel_spectrogram(file_path)
    wav2vec_features = extract_wav2vec_large_features(audio)  # Updated to include padding

    return mfcc_features, mel_spectrogram_features, wav2vec_features
'''
    
def extract_features_from_dataset(dataset_path, augment=False):
    mfcc_features_list = []
    mel_spectrogram_features_list = []
    wav2vec_features_list = []
    labels = []

    for folder_name in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, folder_name)
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.wav'):  # Ensure you only process .wav files
                file_path = os.path.join(folder_path, file_name)
                mfcc, mel_spectrogram, wav2vec = preprocess_and_extract_features(file_path, augment=augment)
                
                mfcc_features_list.append(mfcc)
                mel_spectrogram_features_list.append(mel_spectrogram)
                wav2vec_features_list.append(wav2vec)
                
                # Append labels based on folder name
                if folder_name.lower() == 'real':
                    labels.append(1)  # Real
                else:
                    labels.append(0)  # Fake

    return np.array(mfcc_features_list), np.array(mel_spectrogram_features_list), np.array(wav2vec_features_list), np.array(labels)

# Usage
dataset_path = r'C:\Users\ACER\Desktop\ASV 2015\Train'  # Adjust path accordingly
mfcc_features, mel_spectrogram_features, wav2vec_features, labels = extract_features_from_dataset(dataset_path, augment=True)
print("MFCC Features shape:", mfcc_features.shape)
print("Mel Spectrogram Features shape:", mel_spectrogram_features.shape)
print("Wav2Vec Features shape:", wav2vec_features.shape)
print("Labels shape:", labels.shape)


Input values keys: dict_keys(['input_values'])


TypeError: missing a required argument: 'inputs'

In [4]:
import numpy as np
import os

def pad_or_truncate_spectrogram(mel_spec, target_width=44):
    """Pad or truncate the Mel spectrogram to the target width."""
    current_width = mel_spec.shape[1]
    
    if current_width < target_width:
        # Pad with zeros on the right
        padding = target_width - current_width
        mel_spec = np.pad(mel_spec, ((0, 0), (0, padding)), mode='constant', constant_values=0)
    elif current_width > target_width:
        # Truncate the spectrogram
        mel_spec = mel_spec[:, :target_width]
    
    return mel_spec

def extract_features_from_new_dataset(dataset_path):
    mfcc_features, mel_spectrogram_features, wav2vec_features, labels = [], [], [], []

    for label in ['real', 'fake']:
        folder_path = os.path.join(dataset_path, label)
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            mfcc = extract_mfcc(file_path)
            mel_spec = extract_mel_spectrogram(file_path)  # Get the raw Mel spectrogram
            print(f"Raw Mel spectrogram shape for {file_name}: {mel_spec.shape}")  # Print the shape
            
            # Pad or truncate the Mel spectrogram
            mel_spec = pad_or_truncate_spectrogram(mel_spec, target_width=44)
            mel_spec = np.expand_dims(mel_spec, axis=-1)  # Add channel dimension

            wav2vec = extract_wav2vec_large_features(file_path)

            mfcc_features.append(mfcc)
            mel_spectrogram_features.append(mel_spec)
            wav2vec_features.append(wav2vec)
            labels.append(0 if label == 'real' else 1)

    return np.array(mfcc_features), np.array(mel_spectrogram_features), np.array(wav2vec_features), np.array(labels)

# Use the function to extract features from the new dataset
new_dataset_path = r'C:\Users\ACER\Desktop\ASV 2015\Train'
X_mfcc_new, X_mel_new, X_wav2vec_new, y_new = extract_features_from_new_dataset(new_dataset_path)

# Now X_mel_new should be shaped (num_samples, 128, 44, 1)
print("New MFCC shape:", X_mfcc_new.shape)
print("New Mel spectrogram shape:", X_mel_new.shape)
print("New Wav2Vec shape:", X_wav2vec_new.shape)
print("New Labels shape:", y_new.shape)


Keyword argument `sampling_rate` is not a valid argument for this processor and will be ignored.


Raw Mel spectrogram shape for T13_1002558.wav: (128, 98)
Raw Mel spectrogram shape for T13_1002560.wav: (128, 77)
Raw Mel spectrogram shape for T13_1002562.wav: (128, 201)
Raw Mel spectrogram shape for T13_1002564.wav: (128, 97)
Raw Mel spectrogram shape for T13_1002567.wav: (128, 89)
Raw Mel spectrogram shape for T13_1002573.wav: (128, 87)
Raw Mel spectrogram shape for T13_1002584.wav: (128, 122)
Raw Mel spectrogram shape for T13_1002587.wav: (128, 61)
Raw Mel spectrogram shape for T13_1002589.wav: (128, 67)
Raw Mel spectrogram shape for T13_1002599.wav: (128, 75)
Raw Mel spectrogram shape for T13_1002603.wav: (128, 127)
Raw Mel spectrogram shape for T13_1002654.wav: (128, 66)
Raw Mel spectrogram shape for T13_1002688.wav: (128, 107)
Raw Mel spectrogram shape for T16_1002857.wav: (128, 99)
Raw Mel spectrogram shape for T16_1002861.wav: (128, 127)
Raw Mel spectrogram shape for T16_1002863.wav: (128, 129)
Raw Mel spectrogram shape for T16_1002864.wav: (128, 151)
Raw Mel spectrogram shap

In [12]:
import os

def get_audio_files(dataset_path):
    audio_files = []
    # Loop through each subdirectory in the dataset
    for root, _, files in os.walk(dataset_path):
        for file in files:
            # Check if the file is a .wav file
            if file.endswith('.wav'):
                audio_files.append(os.path.join(root, file))
    return audio_files

# Specify the path to your new dataset
new_dataset_path = r'C:\Users\ACER\Desktop\ASV 2015\Train'
file_list = get_audio_files(new_dataset_path)


In [14]:
def pad_mel_spectrogram(mel_spectrogram, target_width=44):
    # Check the current width
    current_width = mel_spectrogram.shape[1]
    if current_width < target_width:
        # Pad with zeros on the right
        padding_width = target_width - current_width
        padded_mel = np.pad(mel_spectrogram, ((0, 0), (0, padding_width)), mode='constant')
    elif current_width > target_width:
        # Crop the mel spectrogram
        padded_mel = mel_spectrogram[:, :target_width]
    else:
        padded_mel = mel_spectrogram
    return padded_mel.reshape((128, target_width, 1))  # Reshape to (128, target_width, 1)

# Apply the padding function in your extraction loop
mel_spectrogram_features = []

for file in file_list:  # Your list of audio files
    mel_spec = extract_mel_spectrogram(file)
    mel_spec_padded = pad_mel_spectrogram(mel_spec)
    mel_spectrogram_features.append(mel_spec_padded)


In [18]:
# Load your previously saved model
from tensorflow.keras.models import load_model

# Load the model
model = load_model('audio_deepfake_model.h5')

# Train the model on the new dataset
history = model.fit(
    [X_mfcc_new, X_mel_new, X_wav2vec_new],  # Input features
    y_new,  # Target labels
    epochs=50,  # Adjust the number of epochs as necessary
    batch_size=32,  # Adjust batch size as necessary
    validation_split=0.2,  # Use 20% of the data for validation
    verbose=1  # Set to 1 to see the training progress
)

# Save the retrained model
model.save('retrained_audio_deepfake_model.h5')




Epoch 1/50


ValueError: Input 1 of layer "functional_1" is incompatible with the layer: expected shape=(None, 128, 63, 1), found shape=(None, 128, 44)