# Data Preparation and Exploration
Load the audio files from the training, validation, and testing directories, separating real and fake samples. Normalize the audio data to a consistent format (e.g., sampling rate, bit depth). Augment the dataset to increase diversity (e.g., time stretching, pitch shifting, noise addition). Split the dataset into training, validation, and testing sets. Visualize audio waveforms, spectrograms, and MFCCs for both real and fake audio samples. Analyze the statistical properties of the dataset (e.g., mean, standard deviation, distribution).

In [None]:
# Import necessary libraries
import os
import tarfile
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
from sklearn.model_selection import train_test_split

# Download and extract the dataset
!wget https://www.eecs.yorku.ca/~bil/Datasets/for-rerec.tar.gz
with tarfile.open('for-rerec.tar.gz', 'r:gz') as tar:
    tar.extractall()

# Define paths to the dataset directories
train_real_dir = 'for-rerecorded/training/real'
train_fake_dir = 'for-rerecorded/training/fake'
val_real_dir = 'for-rerecorded/validation/real'
val_fake_dir = 'for-rerecorded/validation/fake'
test_real_dir = 'for-rerecorded/testing/real'
test_fake_dir = 'for-rerecorded/testing/fake'

# Sampling rate (Hz) every clip is brought to before augmentation and
# feature extraction.
TARGET_SR = 16000

# File extensions treated as audio (compared case-insensitively).
AUDIO_EXTENSIONS = ('.wav', '.mp3')


# Function to load audio files
def load_audio_files(directory, sr=None):
    """Load every audio file found directly inside `directory`.

    Args:
        directory: Folder containing the audio files. The label is derived
            from the last path component only: 1 when it contains 'real',
            otherwise 0. Matching on the whole path would mislabel every
            clip if a parent folder happened to contain the word 'real'.
        sr: Sampling rate handed to `librosa.load`. `None` keeps each file's
            native rate; an integer makes librosa resample while loading.

    Returns:
        (audio_files, labels): a list of waveforms and a list of integer
        labels of the same length.
    """
    label = 1 if 'real' in os.path.basename(os.path.normpath(directory)) else 0
    audio_files = []
    # Sort the listing: os.listdir order is arbitrary, which would make the
    # dataset order (and everything derived from it) non-reproducible.
    for filename in sorted(os.listdir(directory)):
        if filename.lower().endswith(AUDIO_EXTENSIONS):
            filepath = os.path.join(directory, filename)
            audio, _ = librosa.load(filepath, sr=sr)
            audio_files.append(audio)
    return audio_files, [label] * len(audio_files)


# Load real and fake audio files. Passing sr=TARGET_SR resamples at load
# time, while the native rate is still known; the native rate is not kept
# anywhere afterwards, so resampling later would be impossible.
train_real_audio, train_real_labels = load_audio_files(train_real_dir, sr=TARGET_SR)
train_fake_audio, train_fake_labels = load_audio_files(train_fake_dir, sr=TARGET_SR)
val_real_audio, val_real_labels = load_audio_files(val_real_dir, sr=TARGET_SR)
val_fake_audio, val_fake_labels = load_audio_files(val_fake_dir, sr=TARGET_SR)
test_real_audio, test_real_labels = load_audio_files(test_real_dir, sr=TARGET_SR)
test_fake_audio, test_fake_labels = load_audio_files(test_fake_dir, sr=TARGET_SR)

# Combine real and fake audio data
train_audio_data = train_real_audio + train_fake_audio
train_labels = train_real_labels + train_fake_labels
val_audio_data = val_real_audio + val_fake_audio
val_labels = val_real_labels + val_fake_labels
test_audio_data = test_real_audio + test_fake_audio
test_labels = test_real_labels + test_fake_labels


# Normalize audio data
def normalize_audio(audio, target_sr=16000, orig_sr=None):
    """Bring a waveform to `target_sr`.

    Args:
        audio: Waveform as a NumPy array.
        target_sr: Desired sampling rate in Hz.
        orig_sr: Sampling rate `audio` currently has. `None` means the clip
            is treated as already being at `target_sr`.

    Returns:
        The waveform at `target_sr`; the input is returned unchanged when no
        resampling is needed.
    """
    # librosa.get_samplerate() reads the rate from a *file path*; it cannot
    # infer it from an in-memory array, so the rate must be supplied.
    if orig_sr is None or orig_sr == target_sr:
        return audio
    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)


normalized_train_audio = [
    normalize_audio(audio, target_sr=TARGET_SR, orig_sr=TARGET_SR) for audio in train_audio_data
]
normalized_val_audio = [
    normalize_audio(audio, target_sr=TARGET_SR, orig_sr=TARGET_SR) for audio in val_audio_data
]
normalized_test_audio = [
    normalize_audio(audio, target_sr=TARGET_SR, orig_sr=TARGET_SR) for audio in test_audio_data
]

# Data augmentation functions
def time_stretch(audio, rate=1.1):
    """Return `audio` time-stretched by `rate` using librosa.

    `rate` is passed by keyword: librosa >= 0.10 declares it keyword-only,
    so the positional form raises TypeError there. The keyword form is also
    accepted by older releases.
    """
    return librosa.effects.time_stretch(audio, rate=rate)

def pitch_shift(audio, sr, n_steps=2):
    """Return `audio` pitch-shifted by `n_steps` using librosa.

    Args:
        audio: Waveform as a NumPy array.
        sr: Sampling rate of `audio` in Hz.
        n_steps: Number of steps to shift by, forwarded to librosa.

    `sr` and `n_steps` are passed by keyword: librosa >= 0.10 declares them
    keyword-only, so the positional form raises TypeError there.
    """
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def add_noise(audio, noise_factor=0.005):
    """Return `audio` plus standard-normal noise scaled by `noise_factor`.

    One noise value is drawn per sample from NumPy's global random state.
    """
    sample_count = len(audio)
    scaled_noise = noise_factor * np.random.randn(sample_count)
    return audio + scaled_noise

# Augment the dataset.
# Only the training split is augmented. Augmenting first and then splitting
# at random would scatter variants of the same clip (original, stretched,
# pitch-shifted, noisy) across train/validation/test, leaking information
# and inflating the reported metrics.
augmented_train_audio = []
augmented_train_labels = []
for audio, label in zip(normalized_train_audio, train_labels):
    variants = (
        audio,
        time_stretch(audio),
        pitch_shift(audio, sr=16000),
        add_noise(audio),
    )
    augmented_train_audio.extend(variants)
    augmented_train_labels.extend([label] * len(variants))

# Split the dataset into training, validation, and testing sets.
# The dataset already ships dedicated validation and testing folders (loaded
# and normalized above), so those are used as-is instead of carving new
# splits out of the augmented training data.
X_train, y_train = augmented_train_audio, augmented_train_labels
X_val, y_val = normalized_val_audio, val_labels
X_test, y_test = normalized_test_audio, test_labels

# Visualize audio waveforms, spectrograms, and MFCCs
def visualize_audio(audio, sr=16000):
    """Show waveform, dB spectrogram and 13-coefficient MFCCs of one clip.

    The three views are drawn side by side in a single 12x4 figure, which is
    displayed immediately.
    """
    # Derive both time-frequency representations before any drawing.
    spectrogram_db = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
    mfcc_matrix = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

    plt.figure(figsize=(12, 4))

    # Panel 1: raw waveform
    plt.subplot(1, 3, 1)
    librosa.display.waveshow(audio, sr=sr)
    plt.title('Waveform')

    # Panel 2: log-frequency spectrogram in dB relative to the peak
    plt.subplot(1, 3, 2)
    librosa.display.specshow(spectrogram_db, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')

    # Panel 3: MFCCs over time
    plt.subplot(1, 3, 3)
    librosa.display.specshow(mfcc_matrix, sr=sr, x_axis='time')
    plt.colorbar()
    plt.title('MFCC')

    plt.tight_layout()
    plt.show()

# Visualize a sample of real and fake audio.
# The samples are picked by label (1 = real, 0 = fake) rather than by fixed
# position: X_train's ordering does not guarantee that index 0 is real or
# that index len(train_real_audio) is fake.
def _first_index_with_label(labels, wanted):
    """Return the index of the first entry equal to `wanted`, or None."""
    return next((idx for idx, lbl in enumerate(labels) if lbl == wanted), None)


real_sample_idx = _first_index_with_label(y_train, 1)
fake_sample_idx = _first_index_with_label(y_train, 0)

print("Real Audio Sample:")
if real_sample_idx is not None:
    visualize_audio(X_train[real_sample_idx])

print("Fake Audio Sample:")
if fake_sample_idx is not None:
    visualize_audio(X_train[fake_sample_idx])

# Analyze statistical properties of the dataset
def analyze_statistics(audio_data):
    """Print mean/std of clip lengths (in samples) and plot their histogram."""
    clip_lengths = list(map(len, audio_data))

    print(f"Mean length: {np.mean(clip_lengths)}")
    print(f"Standard deviation of length: {np.std(clip_lengths)}")

    # Histogram of clip lengths, 50 bins
    plt.hist(clip_lengths, bins=50)
    plt.title('Distribution of Audio Lengths')
    plt.xlabel('Length')
    plt.ylabel('Frequency')
    plt.show()


analyze_statistics(X_train)

# Feature Extraction
Utilize MobileNet as the backbone for feature extraction. Extract relevant audio features such as Mel-Frequency Cepstral Coefficients (MFCCs), spectral centroids, and chromagrams. Experiment with different window sizes, hop lengths, and MFCC coefficient counts to optimize feature extraction. Visualize the extracted features to understand their discriminative power.

In [None]:
# Feature Extraction

import tensorflow as tf
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GlobalAveragePooling2D

# Function to extract MFCCs
def extract_mfcc(audio, sr=16000, n_mfcc=13, hop_length=512, n_fft=2048):
    """Return the MFCC matrix of `audio` as computed by librosa.

    All keyword parameters are forwarded unchanged to `librosa.feature.mfcc`.
    """
    return librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=n_mfcc,
        hop_length=hop_length,
        n_fft=n_fft,
    )

# Function to extract spectral centroids
def extract_spectral_centroids(audio, sr=16000, hop_length=512, n_fft=2048):
    """Return the spectral centroid of `audio` as computed by librosa.

    Args:
        audio: Waveform as a NumPy array.
        sr: Sampling rate of `audio` in Hz.
        hop_length: Hop between analysis frames, in samples.
        n_fft: FFT window size. Exposed so the window size can be varied the
            same way as in `extract_mfcc`; the default of 2048 is librosa's
            own default, so existing calls behave as before.
    """
    return librosa.feature.spectral_centroid(
        y=audio, sr=sr, hop_length=hop_length, n_fft=n_fft
    )

# Function to extract chromagrams
def extract_chromagram(audio, sr=16000, hop_length=512, n_fft=2048):
    """Return the STFT-based chromagram of `audio` as computed by librosa.

    Args:
        audio: Waveform as a NumPy array.
        sr: Sampling rate of `audio` in Hz.
        hop_length: Hop between analysis frames, in samples.
        n_fft: FFT window size. Exposed so the window size can be varied the
            same way as in `extract_mfcc`; the default of 2048 is librosa's
            own default, so existing calls behave as before.
    """
    return librosa.feature.chroma_stft(
        y=audio, sr=sr, hop_length=hop_length, n_fft=n_fft
    )

# Extract features from the dataset
def extract_features(audio_data):
    """Compute MFCC, spectral-centroid and chromagram features for each clip.

    Returns three lists (MFCCs, spectral centroids, chromagrams), each with
    one entry per clip, in the order the clips were given.
    """
    # One pass over the clips; each clip yields a (mfcc, centroid, chroma) triple.
    per_clip = [
        (
            extract_mfcc(clip),
            extract_spectral_centroids(clip),
            extract_chromagram(clip),
        )
        for clip in audio_data
    ]

    mfcc_features = [triple[0] for triple in per_clip]
    spectral_centroid_features = [triple[1] for triple in per_clip]
    chromagram_features = [triple[2] for triple in per_clip]
    return mfcc_features, spectral_centroid_features, chromagram_features


mfcc_features, spectral_centroid_features, chromagram_features = extract_features(X_train)

# Visualize extracted features
def visualize_features(mfcc, spectral_centroid, chromagram):
    """Plot one clip's MFCCs, spectral centroid and chromagram side by side."""
    frame_count = spectral_centroid.shape[-1]

    plt.figure(figsize=(15, 5))

    # Left: MFCC matrix over time
    plt.subplot(1, 3, 1)
    librosa.display.specshow(mfcc, x_axis='time')
    plt.colorbar()
    plt.title('MFCC')

    # Middle: spectral centroid per frame on a logarithmic y-axis
    plt.subplot(1, 3, 2)
    plt.semilogy(spectral_centroid.T, label='Spectral Centroid')
    plt.ylabel('Hz')
    plt.xticks([])
    plt.xlim([0, frame_count])
    plt.legend(loc='upper right')
    plt.title('Spectral Centroid')

    # Right: chromagram over time
    plt.subplot(1, 3, 3)
    librosa.display.specshow(chromagram, y_axis='chroma', x_axis='time')
    plt.colorbar()
    plt.title('Chromagram')

    plt.tight_layout()
    plt.show()


# Visualize a sample of extracted features
first_sample = 0
visualize_features(
    mfcc_features[first_sample],
    spectral_centroid_features[first_sample],
    chromagram_features[first_sample],
)

# Utilize MobileNet as the backbone for feature extraction
def create_mobilenet_feature_extractor(input_shape):
    """Build an ImageNet-initialised MobileNet (no top) followed by global average pooling.

    The returned Keras model takes inputs of `input_shape` and outputs the
    pooled backbone features.
    """
    backbone = MobileNet(weights='imagenet', include_top=False, input_shape=input_shape)
    pooled_output = GlobalAveragePooling2D()(backbone.output)
    return Model(inputs=backbone.input, outputs=pooled_output)


# Example usage of MobileNet feature extractor
input_shape = (128, 128, 3)  # Example input shape, adjust as needed
mobilenet_model = create_mobilenet_feature_extractor(input_shape)

# Convert MFCC features to the shape expected by MobileNet
def preprocess_mfcc_for_mobilenet(mfcc_features):
    """Turn a list of MFCC matrices into one batch of 3-channel arrays.

    Each matrix is reshaped to the first two dimensions of the module-level
    `input_shape` and then copied into three identical channels.

    NOTE(review): np.resize fills the target shape by repeating or truncating
    the flattened values; it does not interpolate. Any change here must be
    mirrored wherever inference preprocessing is done, so it is left as-is.
    """
    target_hw = input_shape[:2]

    def _to_three_channels(mfcc):
        plane = np.resize(mfcc, target_hw)
        return np.stack([plane, plane, plane], axis=-1)  # Convert to 3 channels

    return np.array([_to_three_channels(mfcc) for mfcc in mfcc_features])


processed_mfcc_features = preprocess_mfcc_for_mobilenet(mfcc_features)

# Extract features using MobileNet
mobilenet_features = mobilenet_model.predict(processed_mfcc_features)

# Visualize the extracted MobileNet features (first sample only)
first_feature_vector = mobilenet_features[0]
plt.figure(figsize=(10, 5))
plt.plot(first_feature_vector)
plt.xlabel('Feature Index')
plt.ylabel('Feature Value')
plt.title('MobileNet Extracted Features')
plt.show()

# Model Training and Evaluation
Design a suitable classification model (e.g., CNN, RNN, or a hybrid) to classify audio samples as real or fake. Train the model on the extracted features. Implement appropriate loss functions (e.g., cross-entropy loss) and optimization algorithms (e.g., Adam). Fine-tune hyperparameters (e.g., learning rate, batch size, epochs) to optimize performance. Evaluate the model's performance using metrics such as accuracy, precision, recall, F1-score, and AUC-ROC. Visualize the model's learning curve, confusion matrix, and ROC curve. Incorporate XAI techniques (e.g., LIME, SHAP) to understand the model's decision-making process. Visualize feature importance and model predictions.

In [None]:
# Model Training and Evaluation

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import seaborn as sns

# Define the model architecture
def create_classification_model(input_shape):
    """Build the binary CNN classifier.

    Two Conv2D/MaxPooling2D/Dropout stages are followed by a 128-unit dense
    layer and a single sigmoid output unit. The model is returned uncompiled.
    """
    layer_stack = [
        # Convolutional stage 1
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        # Convolutional stage 2
        Conv2D(64, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        # Classification head
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layer_stack)

# Create the model
input_shape = (128, 128, 3)
model = create_classification_model(input_shape)

# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric
adam_optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Preprocess validation and test data (only the MFCC part of the features is used)
val_mfccs = extract_features(X_val)[0]
test_mfccs = extract_features(X_test)[0]
processed_mfcc_features_val = preprocess_mfcc_for_mobilenet(val_mfccs)
processed_mfcc_features_test = preprocess_mfcc_for_mobilenet(test_mfccs)

# Train the model
history = model.fit(
    processed_mfcc_features,
    np.array(y_train),
    validation_data=(processed_mfcc_features_val, np.array(y_val)),
    epochs=20,
    batch_size=32,
)

# Evaluate the model: sigmoid scores, thresholded at 0.5 for hard labels
y_pred = model.predict(processed_mfcc_features_test)
y_pred_classes = (y_pred > 0.5).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
f1 = f1_score(y_test, y_pred_classes)
# AUC is computed from the raw scores, not the thresholded labels
auc_roc = roc_auc_score(y_test, y_pred)

metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", auc_roc),
)
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value}")

# Visualize the learning curve: accuracy on the left, loss on the right
curve_panels = (
    ('accuracy', 'Train Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'Train Loss', 'Validation Loss', 'Model Loss', 'Loss'),
)

plt.figure(figsize=(12, 4))
for panel_no, (key, train_label, val_label, panel_title, y_label) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(history.history[key], label=train_label)
    plt.plot(history.history['val_' + key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

# Confusion matrix of the thresholded test predictions, drawn as a heatmap
conf_matrix = confusion_matrix(y_test, y_pred_classes)

plt.figure(figsize=(8, 6))
heatmap_options = dict(annot=True, fmt='d', cmap='Blues')
sns.heatmap(conf_matrix, **heatmap_options)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ROC curve computed from the raw prediction scores; thresholds are not plotted
roc_points = roc_curve(y_test, y_pred)
fpr, tpr, _ = roc_points

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# Save and download the model
model.save('deepfake_audio_detection_model.h5')

# The browser download only exists inside Google Colab. Elsewhere the model
# is already saved on disk, so skip the download instead of failing on the
# import.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; model saved to deepfake_audio_detection_model.h5")
else:
    files.download('deepfake_audio_detection_model.h5')

# XAI Integration using LIME
import lime
import lime.lime_tabular

# LimeTabularExplainer works on 2-D data (samples x features) and needs a
# prediction function that returns one probability per class. The CNN takes
# 4-D image batches and emits a single sigmoid value, so both directions are
# adapted here.
lime_img_h, lime_img_w = processed_mfcc_features.shape[1:3]


def _flatten_for_lime(batch):
    """Flatten a (n, h, w, 3) batch to (n, h*w) using the first channel.

    The three channels are identical copies of the same MFCC plane (see
    preprocess_mfcc_for_mobilenet), so one channel carries all information.
    """
    batch = np.asarray(batch)
    return batch[..., 0].reshape(len(batch), -1)


def _predict_proba_for_lime(flat_batch):
    """Rebuild 3-channel images from flat rows and return [P(fake), P(real)]."""
    planes = np.asarray(flat_batch, dtype=np.float32).reshape(-1, lime_img_h, lime_img_w)
    images = np.repeat(planes[..., np.newaxis], 3, axis=-1)
    p_real = model.predict(images, verbose=0).reshape(-1, 1)
    return np.hstack([1.0 - p_real, p_real])


# A subset of the training data is enough for LIME's feature statistics and
# keeps memory use bounded.
lime_background = _flatten_for_lime(processed_mfcc_features[:200])

explainer = lime.lime_tabular.LimeTabularExplainer(
    lime_background,
    feature_names=[f'feature_{i}' for i in range(lime_background.shape[1])],
    class_names=['Fake', 'Real'],
    verbose=True,
    mode='classification',
)

# Explain a prediction
i = 0  # Index of the sample to explain
lime_instance = _flatten_for_lime(processed_mfcc_features_test[i:i + 1])[0]
# num_samples is lowered from LIME's default of 5000 because every perturbed
# sample has h*w features.
exp = explainer.explain_instance(
    lime_instance,
    _predict_proba_for_lime,
    num_features=10,
    num_samples=1000,
)
exp.show_in_notebook(show_table=True, show_all=False)

# Model Deployment and Inference
Develop a user interface (e.g., web app, command-line tool) to allow users to input audio files. Preprocess the input audio. Extract features using the trained model. Make a prediction (real or fake). Generate a spectrogram of the input audio. Display the prediction score and classification. Save and download the model. Test the model in Colab itself. Write the main function. Use the downloaded model with a fullstack web application.

In [None]:
import gradio as gr
import tensorflow as tf
import numpy as np
import librosa
import matplotlib.pyplot as plt
import librosa.display
from tensorflow.keras.models import load_model

# Load the trained model from the HDF5 file in the working directory
model_path = 'deepfake_audio_detection_model.h5'
model = load_model(model_path)

# Function to preprocess input audio
def preprocess_input_audio(audio_path):
    """Load an audio file and convert it into a one-sample model input batch.

    The clip is loaded at 16 kHz, reduced to 13 MFCCs, reshaped to 128x128
    with np.resize and copied into three channels. The returned array has a
    leading batch dimension of 1.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=13)
    plane = np.resize(coefficients, (128, 128))
    image = np.stack([plane, plane, plane], axis=-1)
    return np.array([image])

# Function to generate spectrogram
def generate_spectrogram(audio_path):
    """Render the clip's dB spectrogram to 'spectrogram.png' and return that path.

    The file is written to the current working directory and is overwritten
    on every call.
    """
    output_path = 'spectrogram.png'

    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    magnitude = np.abs(librosa.stft(waveform))
    spectrogram_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sample_rate, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    plt.savefig(output_path)
    plt.close()  # release the figure once it has been written to disk

    return output_path

# Function to make prediction
def predict_audio(audio_path):
    """Classify one audio file and render its spectrogram.

    Returns a tuple (classification, prediction_score, spectrogram_path),
    where classification is 'Real' for scores above 0.5 and 'Fake' otherwise.
    """
    model_input = preprocess_input_audio(audio_path)
    # The model outputs one sigmoid value per sample; take the only one.
    prediction_score = model.predict(model_input)[0][0]

    if prediction_score > 0.5:
        classification = 'Real'
    else:
        classification = 'Fake'

    spectrogram_path = generate_spectrogram(audio_path)
    return classification, prediction_score, spectrogram_path

# Create Gradio interface
def gradio_interface(audio):
    """Gradio callback: classify the uploaded clip.

    Args:
        audio: Path of the uploaded file as a string (what
            `gr.Audio(type="filepath")` delivers). An object with a `.name`
            attribute is also accepted. `None` means nothing was uploaded.

    Returns:
        (classification, prediction score as text, spectrogram image path).
    """
    if audio is None:
        # Submit pressed without a file: report it instead of raising.
        return "No audio provided", "", None

    audio_path = audio if isinstance(audio, str) else audio.name
    classification, prediction_score, spectrogram_path = predict_audio(audio_path)
    # The score is a NumPy scalar; format it as plain text for the Textbox.
    return classification, f"{float(prediction_score):.4f}", spectrogram_path


# Components are taken from the top-level `gr` namespace: the `gr.inputs` /
# `gr.outputs` namespaces and `type="file"` were removed in current Gradio.
# `type="filepath"` hands the callback a path on disk.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Audio(type="filepath", label="Audio"),
    outputs=[
        gr.Textbox(label="Classification"),
        gr.Textbox(label="Prediction Score"),
        gr.Image(type="filepath", label="Spectrogram"),
    ],
    title="Deepfake Audio Detection",
    description="Upload an audio file to detect if it is real or fake."
)

# Launch the interface
interface.launch()

# Main function to test the model in Colab
def main(sample_audio_path='path_to_sample_audio.wav'):
    """Run one prediction on a sample file and display the results.

    Args:
        sample_audio_path: Audio file to classify. The default is a
            placeholder and must be replaced with a real path.

    Prints the classification and score, shows an audio player and displays
    the saved spectrogram image. When the file does not exist, a message is
    printed and nothing else is done.
    """
    import os

    # The default path is only a placeholder; give a clear message rather
    # than a traceback from inside the audio loader.
    if not os.path.isfile(sample_audio_path):
        print(f"Sample audio file not found: {sample_audio_path}")
        return

    # Imported here so this cell does not rely on an import made in another
    # cell still being in scope.
    import IPython.display as ipd

    classification, prediction_score, spectrogram_path = predict_audio(sample_audio_path)
    print(f"Classification: {classification}")
    print(f"Prediction Score: {prediction_score}")
    ipd.display(ipd.Audio(sample_audio_path))
    img = plt.imread(spectrogram_path)
    plt.imshow(img)
    plt.axis('off')
    plt.show()


# Run the main function
main()