# In [None]:
import os
import numpy as np
import librosa
from hmmlearn import hmm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

# Function to extract MFCC features from audio files
def extract_mfcc(file_path, n_mfcc=13):
    """Return a single time-averaged MFCC vector for one audio file.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        The MFCC matrix averaged over its second (frame) axis, i.e. one
        value per coefficient.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then collapse that axis.
    per_frame = coefficients.T
    return per_frame.mean(axis=0)

# Load data from a directory and assign labels (1 for real, 0 for fake)
def load_data(data_folder_real, data_folder_fake):
    """Build the feature matrix and label vector from two audio folders.

    Every ``.wav`` file (extension matched case-insensitively) located
    directly inside each folder is converted to a feature vector with
    ``extract_mfcc``. Files from ``data_folder_real`` are labelled 1 and
    files from ``data_folder_fake`` are labelled 0; real samples come first.

    Args:
        data_folder_real: Directory containing the real recordings.
        data_folder_fake: Directory containing the fake recordings.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays holding the feature vectors and
        their labels, in matching order. Both arrays are empty when no
        ``.wav`` file is found.

    Raises:
        OSError: If either folder cannot be listed.
    """
    features = []
    labels = []

    for folder, label in ((data_folder_real, 1), (data_folder_fake, 0)):
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and any seeded train/test split) reproducible.
        for file_name in sorted(os.listdir(folder)):
            # Compare in lower case so ".WAV" files are not silently skipped.
            if not file_name.lower().endswith(".wav"):
                continue
            features.append(extract_mfcc(os.path.join(folder, file_name)))
            labels.append(label)

    return np.array(features), np.array(labels)

# Train HMM model
def train_hmm(X_train, n_components=4, covariance_type="diag", n_iter=100,
              random_state=None):
    """Fit a Gaussian HMM to the training feature vectors.

    Args:
        X_train: Array of feature vectors, one row per observation.
        n_components: Number of hidden states.
        covariance_type: Covariance structure passed to ``GaussianHMM``.
        n_iter: Maximum number of EM iterations.
        random_state: Optional seed forwarded to ``GaussianHMM`` so a fit
            can be reproduced; ``None`` keeps the library default.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance.
    """
    model = hmm.GaussianHMM(
        n_components=n_components,
        covariance_type=covariance_type,
        n_iter=n_iter,
        random_state=random_state,
    )
    # No ``lengths`` argument is given, so all rows are fit as one sequence.
    model.fit(X_train)
    return model

# Predict using HMM
def predict_hmm(model, X_test):
    """Score every test vector under ``model``.

    Args:
        model: Object exposing a ``score(sequence)`` method.
        X_test: Iterable of feature vectors.

    Returns:
        A list with one ``model.score`` result per vector, in input order.
    """
    # Each vector is wrapped in a list so it is scored as a sequence
    # containing a single observation.
    return [model.score([sample]) for sample in X_test]

# Plot confusion matrix
def plot_confusion_matrix(cm, class_names):
    """Display a confusion matrix as an annotated heatmap.

    Args:
        cm: Matrix of integer counts to draw.
        class_names: Tick labels used on both axes.
    """
    _, axes = plt.subplots(figsize=(8, 6))
    heatmap_options = {
        "annot": True,
        "fmt": "d",
        "cmap": "Blues",
        "xticklabels": class_names,
        "yticklabels": class_names,
    }
    sns.heatmap(cm, ax=axes, **heatmap_options)
    axes.set_ylabel('Actual')
    axes.set_xlabel('Predicted')
    plt.show()

# Plot all MFCC feature graphs
def plot_all_mfcc_features(X_test):
    """Show one line plot per feature vector in ``X_test``.

    A separate figure is created and displayed for every sample; samples
    are numbered from 1 in the titles and legends.
    """
    for i, vector in enumerate(X_test):
        _, axes = plt.subplots(figsize=(10, 4))
        axes.plot(vector, label=f'MFCCs of sample {i+1}')
        axes.set_title(f"MFCC Feature Plot for Sample {i+1}")
        axes.legend()
        plt.show()

# Main execution
if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly,
    # not when it is imported.
    data_folder_real = r"C:\Users\rishi\Downloads\KAGGLE\AUDIO\REAL"
    data_folder_fake = r"C:\Users\rishi\Downloads\KAGGLE\AUDIO\FAKE"

    # Load the dataset (label 1 = real, 0 = fake)
    X, y = load_data(data_folder_real, data_folder_fake)

    # Hold out 20% of the samples for evaluation; the fixed seed keeps the
    # split repeatable for a given sample order.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train HMM model on the training features (labels are not used here)
    hmm_model = train_hmm(X_train)

    # Log-likelihood of each test sample under the trained model
    predictions = np.array(predict_hmm(hmm_model, X_test))

    # Convert log probabilities to binary class labels by thresholding at
    # their mean: above-average likelihood -> 1 (Real), otherwise 0 (Fake).
    # NOTE(review): the model is fit without labels, so this mapping is a
    # heuristic — confirm it matches the intended decision rule.
    pred_labels = (predictions > np.mean(predictions)).astype(int)

    # Calculate accuracy and plot confusion matrix
    accuracy = accuracy_score(y_test, pred_labels)
    print(f"Accuracy: {accuracy:.2f}")

    cm = confusion_matrix(y_test, pred_labels)
    plot_confusion_matrix(cm, class_names=["Fake", "Real"])

    # Plot all MFCC features for all test samples
    plot_all_mfcc_features(X_test)