In [1]:
import kagglehub

# Fetch the latest published version of the dataset; the call returns the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download(
    "mohammedabdeldayem/the-fake-or-real-dataset"
)

# Report the directory so later cells can be pointed at it.
print("Path to dataset files:", path)


Resuming download from 3185573888 bytes (14026199359 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/mohammedabdeldayem/the-fake-or-real-dataset?dataset_version_number=2 (3185573888/17211773247) bytes left.


100%|██████████| 16.0G/16.0G [27:01<00:00, 8.65MB/s]  

Extracting files...





Path to dataset files: C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2


In [4]:
import os
import numpy as np
import librosa
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# ✅ STEP 1: Dataset Paths
# The dataset root can be changed without editing the notebook by setting the
# FOR_DATASET_DIR environment variable before starting the kernel. When it is
# unset (or empty) the original download location is used.
base_path = os.environ.get("FOR_DATASET_DIR") or (
    r"C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2"
)

# Training and validation splits come from the "for-norm" variant of the
# dataset; the final test split comes from the "for-rerec" variant.
train_dir = os.path.join(base_path, "for-norm", "for-norm", "training")
val_dir = os.path.join(base_path, "for-norm", "for-norm", "validation")
test_dir = os.path.join(base_path, "for-rerec", "for-rerecorded", "testing")



In [5]:
def extract_features(file_path, n_mfcc=13, min_samples=2048):
    """Summarise one audio file as a fixed-length MFCC feature vector.

    The file is decoded at its native sample rate (``sr=None``), MFCCs are
    computed, and each coefficient is averaged over all frames.

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients to compute.
        min_samples: Clips with fewer samples than this are skipped. The
            default of 2048 matches librosa's default FFT window length.

    Returns:
        A 1-D array with one mean value per MFCC coefficient, or ``None``
        when the clip is too short or could not be processed.
    """
    try:
        audio, sr = librosa.load(file_path, sr=None)
        if len(audio) < min_samples:
            print(f"⚠️ Skipping short file: {file_path}")
            return None
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Transpose so frames run along axis 0, then average them away.
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        # Deliberate best-effort: one unreadable file is reported and skipped
        # so it does not abort the load of the whole dataset.
        print(f"❌ Error with {file_path}: {e}")
        return None


# ✅ STEP 3: Dataset Loader
def load_dataset(folder):
    """Build feature and label arrays from a folder with ``real``/``fake`` subfolders.

    Every ``.wav`` file in ``<folder>/real`` is labelled 0 and every ``.wav``
    file in ``<folder>/fake`` is labelled 1. Files for which
    ``extract_features`` returns ``None`` are left out.

    Args:
        folder: Directory containing the ``real`` and ``fake`` subdirectories.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked feature vectors and the
        matching integer labels, in a deterministic (sorted-by-filename) order
        with all ``real`` rows before all ``fake`` rows.

    Raises:
        OSError: If either label subdirectory cannot be listed.
    """
    X, y = [], []
    for label_name, label_val in [('real', 0), ('fake', 1)]:
        label_path = os.path.join(folder, label_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and anything seeded on it downstream) reproducible.
        for fname in sorted(os.listdir(label_path)):
            # Compare case-insensitively so ".WAV" files are not silently dropped.
            if not fname.lower().endswith('.wav'):
                continue
            feat = extract_features(os.path.join(label_path, fname))
            if feat is not None:
                X.append(feat)
                y.append(label_val)
    return np.array(X), np.array(y)



In [6]:
# ✅ STEP 4: Load Train/Validation/Test Data
# Each call walks one split directory and extracts features file by file.
print("📦 Loading datasets...")
X_train, y_train = load_dataset(train_dir)
X_val, y_val = load_dataset(val_dir)
X_test, y_test = load_dataset(test_dir)

# ✅ STEP 5: Model Candidates
# Every estimator that accepts a seed gets the same fixed one, so validation
# scores and the model selected from them are reproducible between runs.
# SVC only uses its seed for the internal probability calibration that
# probability=True enables.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}



📦 Loading datasets...
⚠️ Skipping short file: C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2\for-norm\for-norm\training\real\file11064.wav_16k.wav_norm.wav_mono.wav_silence.wav
⚠️ Skipping short file: C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2\for-norm\for-norm\training\real\file15440.wav_16k.wav_norm.wav_mono.wav_silence.wav
⚠️ Skipping short file: C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2\for-norm\for-norm\training\real\file15932.wav_16k.wav_norm.wav_mono.wav_silence.wav
⚠️ Skipping short file: C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2\for-norm\for-norm\training\fake\file2846.wav_16k.wav_norm.wav_mono.wav_silence.wav
⚠️ Skipping short file: C:\Users\91986\.cache\kagglehub\datasets\mohammedabdeldayem\the-fake-or-real-dataset\versions\2\for-norm\for-norm\validation\real\file16316

In [7]:
# ✅ STEP 6: Train & Evaluate Models
# Fit each candidate on the training split and score it on the validation
# split. Each entry of `results` is (model name, validation accuracy, fitted
# estimator) so the next step can pick a winner.
results = []
for name in models:
    model = models[name]
    print(f"\n🚀 Training {name}...")
    model.fit(X_train, y_train)

    # Validation predictions drive both the headline accuracy and the report.
    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    results.append((name, acc, model))

    print(f"✅ {name} Accuracy on Validation Set: {acc:.4f}")
    print(classification_report(y_val, val_pred))




🚀 Training RandomForest...
✅ RandomForest Accuracy on Validation Set: 0.9738
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      5399
           1       0.97      0.98      0.97      5398

    accuracy                           0.97     10797
   macro avg       0.97      0.97      0.97     10797
weighted avg       0.97      0.97      0.97     10797


🚀 Training LogisticRegression...
✅ LogisticRegression Accuracy on Validation Set: 0.7699
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      5399
           1       0.77      0.77      0.77      5398

    accuracy                           0.77     10797
   macro avg       0.77      0.77      0.77     10797
weighted avg       0.77      0.77      0.77     10797


🚀 Training SVM...
✅ SVM Accuracy on Validation Set: 0.8681
              precision    recall  f1-score   support

           0       0.88      0.85      0.87      5399
         

In [8]:
# ✅ STEP 7: Pick Best Model
# `results` holds (name, validation accuracy, fitted model) tuples; select the
# tuple whose accuracy (index 1) is highest.
best_model = max(results, key=lambda entry: entry[1])
best_name, best_acc, best_model_instance = best_model
print(f"\n🏆 Best Model: {best_name} (Accuracy: {best_acc:.4f})")

# ✅ STEP 8: Save Model in Same Folder as Notebook
# os.getcwd() is the kernel's current working directory — presumably the
# notebook's own folder; confirm if the kernel is started from elsewhere.
notebook_dir = os.getcwd()
model_filename = f"best_model_{best_name}.pkl"
model_path = os.path.join(notebook_dir, model_filename)
joblib.dump(best_model_instance, model_path)
print(f"💾 Saved best model as: {model_filename} in {notebook_dir}")

# ✅ STEP 9: Final Evaluation on Rerecorded Test Set
# The test split was not used for fitting or model selection above.
print("\n🧪 Final Evaluation on Rerecorded Test Set:")
test_pred = best_model_instance.predict(X_test)
test_report = classification_report(y_test, test_pred)
print(test_report)



🏆 Best Model: RandomForest (Accuracy: 0.9738)
💾 Saved best model as: best_model_RandomForest.pkl in c:\Users\91986\Desktop\DSPL MPR

🧪 Final Evaluation on Rerecorded Test Set:
              precision    recall  f1-score   support

           0       0.93      0.48      0.63       408
           1       0.65      0.96      0.78       408

    accuracy                           0.72       816
   macro avg       0.79      0.72      0.70       816
weighted avg       0.79      0.72      0.70       816

