In [1]:
!git clone https://huggingface.co/datasets/CSALT/deepfake_detection_dataset_urdu

Cloning into 'deepfake_detection_dataset_urdu'...
remote: Enumerating objects: 6796, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 6796 (delta 0), reused 0 (delta 0), pack-reused 6793 (from 1)[K
Receiving objects: 100% (6796/6796), 957.64 KiB | 4.37 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (6796/6796), done.
Filtering content: 100% (6794/6794), 1.82 GiB | 38.73 MiB/s, done.


In [2]:
import os
import numpy as np
import librosa
import pandas as pd
from sklearn.utils import shuffle
import random

# Parameters
SAMPLE_RATE = 16000  # Standard sample rate for audio
N_MFCC = 13          # Number of MFCC coefficients
MAX_FRAMES = 300     # Fixed number of frames for uniform input
N_SAMPLES_PER_CLASS = 500  # Number of samples per class (Bonafide and Deepfake)

# Function to extract MFCC features
def extract_mfcc(file_path):
    """Extract MFCC features from an audio file"""
    try:
        # Load audio file
        audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # Extract MFCCs
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=N_MFCC)

        # Pad or truncate to MAX_FRAMES
        if mfcc.shape[1] > MAX_FRAMES:
            mfcc = mfcc[:, :MAX_FRAMES]
        else:
            mfcc = np.pad(mfcc, ((0, 0), (0, MAX_FRAMES - mfcc.shape[1])), mode='constant')

        # Flatten to 1D vector
        mfcc_flat = mfcc.flatten()
        return mfcc_flat
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Function to collect file paths recursively
def collect_file_paths(dataset_dir):
    """Collect file paths and labels from dataset recursively"""
    bonafide_files = []
    deepfake_files = []

    # Check for Bonafide folder
    bonafide_dir = None
    for possible_name in ['Bonafide', 'bonafide', 'Bonafide_Audio', 'Real']:
        candidate = os.path.join(dataset_dir, possible_name)
        if os.path.isdir(candidate):
            bonafide_dir = candidate
            break

    if bonafide_dir is None:
        print("Error: Could not find Bonafide folder in", dataset_dir)
        print("Checked for: Bonafide, bonafide, Bonafide_Audio, Real")
        return [], []

    # Bonafide files (recursive search)
    print(f"Scanning Bonafide folder: {bonafide_dir}")
    for root, _, files in os.walk(bonafide_dir):
        for file in files:
            if file.lower().endswith('.wav'):
                file_path = os.path.join(root, file)
                bonafide_files.append(file_path)

    # Deepfake files (Spoofed_Tacotron and Spoofed_TTS)
    for spoof_dir_name in ['Spoofed_Tacotron', 'spoofed_tacotron', 'Spoofed_TTS', 'spoofed_tts']:
        spoof_dir = os.path.join(dataset_dir, spoof_dir_name)
        if os.path.isdir(spoof_dir):
            print(f"Scanning Deepfake folder: {spoof_dir}")
            for root, _, files in os.walk(spoof_dir):
                for file in files:
                    if file.lower().endswith('.wav'):
                        file_path = os.path.join(root, file)
                        deepfake_files.append(file_path)

    return bonafide_files, deepfake_files

# Main preprocessing function
def preprocess_dataset(dataset_dir, output_dir='preprocessed'):
    """Preprocess audio dataset and save features"""
    # Create output directory
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Collect file paths
    bonafide_files, deepfake_files = collect_file_paths(dataset_dir)
    print(f"Found {len(bonafide_files)} Bonafide files and {len(deepfake_files)} Deepfake files")

    # Validate dataset
    if len(bonafide_files) == 0:
        raise ValueError("No Bonafide files found. Cannot proceed with preprocessing.")
    if len(deepfake_files) == 0:
        raise ValueError("No Deepfake files found. Cannot proceed with preprocessing.")

    # Balance dataset
    random.seed(42)  # For reproducibility
    max_samples = min(N_SAMPLES_PER_CLASS, len(bonafide_files), len(deepfake_files))
    if max_samples < N_SAMPLES_PER_CLASS:
        print(f"Warning: Reducing samples per class to {max_samples} due to limited files")

    bonafide_files = random.sample(bonafide_files, max_samples)
    deepfake_files = random.sample(deepfake_files, max_samples)

    # Initialize arrays
    X = []
    y = []
    file_list = []

    # Process Bonafide files (label = 0)
    print("Processing Bonafide files...")
    for file_path in bonafide_files:
        mfcc = extract_mfcc(file_path)
        if mfcc is not None:
            X.append(mfcc)
            y.append(0)
            file_list.append({'file_path': file_path, 'label': 'Bonafide'})

    # Process Deepfake files (label = 1)
    print("Processing Deepfake files...")
    for file_path in deepfake_files:
        mfcc = extract_mfcc(file_path)
        if mfcc is not None:
            X.append(mfcc)
            y.append(1)
            file_list.append({'file_path': file_path, 'label': 'Deepfake'})

    # Convert to NumPy arrays
    X = np.array(X)
    y = np.array(y)

    # Validate output
    if len(X) == 0:
        raise ValueError("No valid features extracted. Check audio files for errors.")

    # Shuffle data
    X, y = shuffle(X, y, random_state=42)

    # Save features and labels
    np.save(os.path.join(output_dir, 'X.npy'), X)
    np.save(os.path.join(output_dir, 'y.npy'), y)

    # Save file list
    file_df = pd.DataFrame(file_list)
    file_df.to_csv(os.path.join(output_dir, 'file_list.csv'), index=False)

    print(f"Preprocessed {len(X)} samples with {X.shape[1]} features each")
    print(f"Saved features to {output_dir}/X.npy and labels to {output_dir}/y.npy")
    print(f"Saved file list to {output_dir}/file_list.csv")

if __name__ == "__main__":
    dataset_dir = "/content/deepfake_detection_dataset_urdu"  # Updated path
    preprocess_dataset(dataset_dir)

Scanning Bonafide folder: /content/deepfake_detection_dataset_urdu/Bonafide
Scanning Deepfake folder: /content/deepfake_detection_dataset_urdu/Spoofed_Tacotron
Scanning Deepfake folder: /content/deepfake_detection_dataset_urdu/Spoofed_TTS
Found 3398 Bonafide files and 3396 Deepfake files
Processing Bonafide files...
Processing Deepfake files...
Preprocessed 1000 samples with 3900 features each
Saved features to preprocessed/X.npy and labels to preprocessed/y.npy
Saved file list to preprocessed/file_list.csv


In [3]:
import numpy as np
y = np.load('preprocessed/y.npy')
print(f"Bonafide samples (label 0): {sum(y == 0)}")
print(f"Deepfake samples (label 1): {sum(y == 1)}")

Bonafide samples (label 0): 500
Deepfake samples (label 1): 500


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import os

# Load preprocessed data
X = np.load('preprocessed/X.npy')
y = np.load('preprocessed/y.npy')

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize models
svm_model = SVC(probability=True, random_state=42)
lr_model = LogisticRegression(random_state=42)
perceptron_model = Perceptron(random_state=42)
dnn_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)

# Train models
print("Training SVM...")
svm_model.fit(X_train, y_train)
print("Training Logistic Regression...")
lr_model.fit(X_train, y_train)
print("Training Perceptron...")
perceptron_model.fit(X_train, y_train)
print("Training DNN...")
dnn_model.fit(X_train, y_train)

# Make predictions
models = {'SVM': svm_model, 'Logistic Regression': lr_model, 'Perceptron': perceptron_model, 'DNN': dnn_model}
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else (y_pred).astype(float)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_prob) if hasattr(model, 'predict_proba') else None

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    if auc_roc is not None:
        print(f"AUC-ROC: {auc_roc:.4f}")
    else:
        print("AUC-ROC: Not available (model does not support probabilities)")

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save models
joblib.dump(svm_model, 'models/svm_model.pkl')
joblib.dump(lr_model, 'models/lr_model.pkl')
joblib.dump(perceptron_model, 'models/perceptron_model.pkl')
joblib.dump(dnn_model, 'models/dnn_model.pkl')
print("\nModels saved to 'models/' directory")

Training SVM...
Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Perceptron...
Training DNN...

SVM Results:
Accuracy: 0.8050
Precision: 0.8144
Recall: 0.7900
F1-Score: 0.8020
AUC-ROC: 0.9079

Logistic Regression Results:
Accuracy: 0.7950
Precision: 0.7980
Recall: 0.7900
F1-Score: 0.7940
AUC-ROC: 0.8712

Perceptron Results:
Accuracy: 0.7350
Precision: 0.6942
Recall: 0.8400
F1-Score: 0.7602
AUC-ROC: Not available (model does not support probabilities)

DNN Results:
Accuracy: 0.7550
Precision: 0.7802
Recall: 0.7100
F1-Score: 0.7435
AUC-ROC: 0.8546

Models saved to 'models/' directory


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load preprocessed data
X = np.load('preprocessed/X.npy')
y = np.load('preprocessed/y.npy')

# Split into training and test sets (same split as training)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Load models
svm_model = joblib.load('models/svm_model.pkl')
lr_model = joblib.load('models/lr_model.pkl')
perceptron_model = joblib.load('models/perceptron_model.pkl')
dnn_model = joblib.load('models/dnn_model.pkl')

# Evaluate models
models = {'SVM': svm_model, 'Logistic Regression': lr_model, 'Perceptron': perceptron_model, 'DNN': dnn_model}
results = []

for name, model in models.items():
    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    # Append results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC-ROC': auc_roc if auc_roc is not None else 'N/A'
    })

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Bonafide', 'Deepfake'], yticklabels=['Bonafide', 'Deepfake'])
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'confusion_matrix_{name.lower().replace(" ", "_")}.png')
    plt.close()

# Create comparison table
results_df = pd.DataFrame(results)
print("\nModel Comparison Table:")
print(results_df.to_string(index=False))
results_df.to_csv('model_comparison.csv', index=False)

# Plot comparison bar chart
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(models))
width = 0.2

for i, metric in enumerate(metrics):
    ax.bar(x + i * width, results_df[metric], width, label=metric)

ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(models.keys())
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
ax.legend()
plt.tight_layout()
plt.savefig('model_comparison_bar_chart.png')
plt.close()

print("\nConfusion matrices saved as PNG files.")
print("Comparison table saved to 'model_comparison.csv'")
print("Comparison bar chart saved to 'model_comparison_bar_chart.png'")


Model Comparison Table:
              Model  Accuracy  Precision  Recall  F1-Score  AUC-ROC
                SVM     0.805   0.814433    0.79  0.802030   0.9079
Logistic Regression     0.795   0.797980    0.79  0.793970   0.8712
         Perceptron     0.735   0.694215    0.84  0.760181      N/A
                DNN     0.755   0.780220    0.71  0.743455  0.85455

Confusion matrices saved as PNG files.
Comparison table saved to 'model_comparison.csv'
Comparison bar chart saved to 'model_comparison_bar_chart.png'


In [9]:
import gradio as gr
import numpy as np
import librosa
import joblib

SAMPLE_RATE = 16000
N_MFCC = 13
MAX_FRAMES = 300

def extract_mfcc(path):
    audio, _ = librosa.load(path, sr=SAMPLE_RATE)
    mfcc = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=N_MFCC)
    if mfcc.shape[1] > MAX_FRAMES:
        mfcc = mfcc[:, :MAX_FRAMES]
    else:
        mfcc = np.pad(mfcc, ((0,0),(0,MAX_FRAMES-mfcc.shape[1])), mode='constant')
    return mfcc.flatten()

def predict_audio(audio_path, model_choice):
    model_paths = {
        "SVM": "/content/models/svm_model.pkl",
        "Logistic Regression": "/content/models/lr_model.pkl",
        "DNN": "/content/models/dnn_model.pkl",
        "Perceptron": "/content/models/perceptron_model.pkl"
    }
    model = joblib.load(model_paths[model_choice])
    feats = extract_mfcc(audio_path).reshape(1, -1)
    pred = model.predict(feats)[0]
    label = "Bonafide" if pred == 0 else "Deepfake"
    conf = None
    if hasattr(model, "predict_proba"):
        conf = float(np.max(model.predict_proba(feats)))
    return label, f"{conf:.2%}" if conf is not None else "N/A"

with gr.Blocks() as demo:
    gr.Markdown("## Zainab's Deep Fake detector")
    with gr.Row():
        # <-- removed source="upload"
        audio_in = gr.Audio(label="Upload .wav/.mp3", type="filepath")
        model_sel = gr.Radio(["SVM", "Logistic Regression", "DNN", "Perceptron"],
                             value="SVM", label="Model")
    btn = gr.Button("Analyze")
    out_label = gr.Label(label="Prediction")
    out_conf  = gr.Textbox(label="Confidence", interactive=False)
    btn.click(predict_audio, [audio_in, model_sel], [out_label, out_conf])

# share=True gives you a public URL from Colab
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://71e5f627ce619c0122.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


