Process Data

## Sample data
* 4.5k real voices from FoR
* 4.5k fake voices from against Faird + RawNet
* 1.0k fake voices from against Deep4SNet (as testing later)

In [7]:
import os
import random

# Define the directories
dir_real_for = r'Data\DeepVC-Dataset\RQ1\for-real-validation'
dir_fake_farid = r'Data\DeepVC-Dataset\RQ3\for-bh-madefake-final-r4k'
dir_fake_rawnet = r'Data\DeepVC-Dataset\RQ3\for-rawnet-madefake-final-r4k'
dir_fake_deep4s = r'Data\DeepVC-Dataset\RQ3\for-deep4s-madefake-final-r4k'

# Function to get .wav files from a directory
def get_wav_files(directory):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.wav')]

# Get a list of all .wav file paths in the directories
files_real_for = get_wav_files(dir_real_for)
files_fake_farid = get_wav_files(dir_fake_farid)
files_fake_rawnet = get_wav_files(dir_fake_rawnet)
files_fake_deep4s = get_wav_files(dir_fake_deep4s)

# Check files exist
print("All files:")
print("RQ1/for-real-validation:", len(files_real_for))
print("RQ3/for-bh-madefake-final-r4k:", len(files_fake_farid))
print("RQ3/for-rawnet-madefake-final-r4k:", len(files_fake_rawnet))
print("RQ3/for-deep4s-madefake-final-r4k:", len(files_fake_deep4s))

# Function to sample files safely
def sample_files(file_list, sample_size):
    return random.sample(file_list, min(sample_size, len(file_list))) if len(file_list) >= sample_size else file_list

# Randomly sample from each file path
sample_real = sample_files(files_real_for, 4500)
sample_fake_1 = sample_files(files_fake_farid, 2995)
sample_fake_2 = sample_files(files_fake_rawnet, 1505)
sample_fake_deep4s = sample_files(files_fake_deep4s, 1000)

# Check samples exist
print("\nSample sizes:")
print("RQ1/for-real-validation:", len(sample_real))
print("RQ3/for-bh-madefake-final-r4k:", len(sample_fake_1))
print("RQ3/for-rawnet-madefake-final-r4k:", len(sample_fake_2))
print("RQ3/for-deep4s-madefake-final-r4k:", len(sample_fake_deep4s))


All files:
RQ1/for-real-validation: 5400
RQ3/for-bh-madefake-final-r4k: 8720
RQ3/for-rawnet-madefake-final-r4k: 1505
RQ3/for-deep4s-madefake-final-r4k: 9015

Sample sizes:
RQ1/for-real-validation: 4500
RQ3/for-bh-madefake-final-r4k: 2995
RQ3/for-rawnet-madefake-final-r4k: 1505
RQ3/for-deep4s-madefake-final-r4k: 1000


In [8]:
import os
import random

# Define directories for the datasets
dir_real_for = r'Data\DeepVC-Dataset\RQ1\for-real-validation'
dir_fake_farid = r'Data\DeepVC-Dataset\RQ3\for-bh-madefake-final-r4k'
dir_fake_rawnet = r'Data\DeepVC-Dataset\RQ3\for-rawnet-madefake-final-r4k'
dir_fake_deep4s = r'Data\DeepVC-Dataset\RQ3\for-deep4s-madefake-final-r4k'

# List all .wav files in each directory
files_real_for = [os.path.join(dir_real_for, f) for f in os.listdir(dir_real_for) if f.endswith('.wav')]
files_fake_farid = [os.path.join(dir_fake_farid, f) for f in os.listdir(dir_fake_farid) if f.endswith('.wav')]
files_fake_rawnet = [os.path.join(dir_fake_rawnet, f) for f in os.listdir(dir_fake_rawnet) if f.endswith('.wav')]
files_fake_deep4s = [os.path.join(dir_fake_deep4s, f) for f in os.listdir(dir_fake_deep4s) if f.endswith('.wav')]

# Sample files from each dataset
sample_real = random.sample(files_real_for, 4500)
sample_fake_1 = random.sample(files_fake_farid, 2995)
sample_fake_2 = random.sample(files_fake_rawnet, 1505)
sample_fake_deep4s = random.sample(files_fake_deep4s, 1000)

# Print the sizes of the sampled datasets
print("\nSample sizes:")
print("RQ1/for-real-validation ", len(sample_real))
print("RQ3/for-bh-madefake-final-r4k ", len(sample_fake_1))
print("RQ3/for-rawnet-madefake-final-r4k ", len(sample_fake_2))
print("RQ3/for-deep4s-madefake-final-r4k ", len(sample_fake_deep4s))

# Create a dictionary to store labels
data = {}
for file_path in sample_real:
    data[file_path] = 'real'
for file_path in sample_fake_1 + sample_fake_2 + sample_fake_deep4s:
    data[file_path] = 'fake'

# Count the number of real and fake files
num_real = sum(1 for label in data.values() if label == 'real')
num_fake = sum(1 for label in data.values() if label == 'fake')

# Print total number of files and the counts of real and fake
print("Total number of files:", len(data))
print("Number of real files:", num_real)
print("Number of fake files:", num_fake)

# Display labels for the first 3 files
print("\nLabels for the first 3 files:")
for file_path, label in list(data.items())[:3]:
    print(file_path, "->", label)

if len(data) >= 3:
    print("\nLabels for the last 3 files:")
    for file_path, label in list(data.items())[-3:]:
        print(file_path, "->", label)
else:
    print("\nNot enough files to display the last 3.")

# Create a separate dictionary for data_deep4s
data_deep4s = {file_path: 'fake' for file_path in sample_fake_deep4s}

# Print the total number of files in data_deep4s
print("\nTotal number of files in data_deep4s:", len(data_deep4s))

# Display labels for the first 3 files in data_deep4s
print("\nLabels for the first 3 files in data_deep4s:")
for file_path, label in list(data_deep4s.items())[:3]:
    print(file_path, "->", label)

if len(data_deep4s) >= 3:
    print("\nLabels for the last 3 files in data_deep4s:")
    for file_path, label in list(data_deep4s.items())[-3:]:
        print(file_path, "->", label)
else:
    print("\nNot enough files in data_deep4s to display the last 3.")


Sample sizes:
RQ1/for-real-validation  4500
RQ3/for-bh-madefake-final-r4k  2995
RQ3/for-rawnet-madefake-final-r4k  1505
RQ3/for-deep4s-madefake-final-r4k  1000
Total number of files: 10000
Number of real files: 4500
Number of fake files: 5500

Labels for the first 3 files:
Data\DeepVC-Dataset\RQ1\for-real-validation\file31830.wav_16k.wav_norm.wav_mono.wav_silence.wav -> real
Data\DeepVC-Dataset\RQ1\for-real-validation\file30610.wav_16k.wav_norm.wav_mono.wav_silence.wav -> real
Data\DeepVC-Dataset\RQ1\for-real-validation\file239.wav_16k.wav_norm.wav_mono.wav_silence.wav -> real

Labels for the last 3 files:
Data\DeepVC-Dataset\RQ3\for-deep4s-madefake-final-r4k\file32831.wav_16k.wav_norm.wav_mono.wav_silence.wav_02.wav.noisered.wav -> fake
Data\DeepVC-Dataset\RQ3\for-deep4s-madefake-final-r4k\file6050.wav_16k.wav_norm.wav_mono.wav_silence.wav_00.wav.noisered.wav -> fake
Data\DeepVC-Dataset\RQ3\for-deep4s-madefake-final-r4k\file13604.wav_16k.wav_norm.wav_mono.wav_silence.wav_02.wav.noise

## Split data
* Train: 70%
* Validation: 15%
* Test: 15%

In [9]:
import random
import shutil

# Shuffle the dictionary keys
keys = list(data.keys())
random.shuffle(keys)
deep4s_keys = list(data_deep4s.keys())
random.shuffle(deep4s_keys)

# Calculate the sizes of each set
total_size = len(keys)
train_size = int(total_size * 0.7)
val_size = int(total_size * 0.15)

# Divide the keys into training, validation, and testing sets
train_keys = keys[:train_size]
val_keys = keys[train_size:train_size + val_size]
test_keys = keys[train_size + val_size:]

# Retrieve the corresponding file paths and labels for each set
train_set = [(key, data[key]) for key in train_keys]
val_set = [(key, data[key]) for key in val_keys]
test_set = [(key, data[key]) for key in test_keys]
deep4s_set = [(key, data_deep4s[key]) for key in deep4s_keys]

# Create the output directory
output_dir = 'filtered_data'  # Directory for filtered data
os.makedirs(output_dir, exist_ok=True)

# Create subdirectories for train, val, and test sets
os.makedirs(os.path.join(output_dir, 'train'), exist_ok=True)
os.makedirs(os.path.join(output_dir, 'val'), exist_ok=True)
os.makedirs(os.path.join(output_dir, 'test'), exist_ok=True)

# Function to save file paths to their respective directories
def save_to_directory(data_set, directory):
    for file_path, label in data_set:
        # Copy the file to the appropriate directory
        shutil.copy(file_path, os.path.join(directory, os.path.basename(file_path)))

# Save each set
save_to_directory(train_set, os.path.join(output_dir, 'train'))
save_to_directory(val_set, os.path.join(output_dir, 'val'))
save_to_directory(test_set, os.path.join(output_dir, 'test'))

# Print sizes of each set
print("Training set size:", len(train_set))
print("Validation set size:", len(val_set))
print("Testing set size:", len(test_set))
print("Deep4S set size:", len(deep4s_set))

Training set size: 7000
Validation set size: 1500
Testing set size: 1500
Deep4S set size: 1000


In [10]:
# Check 1 example from each
print("\n", train_set[0])
print("\n", val_set[0])
print("\n", test_set[0])
print("\n", deep4s_set[0])


 ('Data\\DeepVC-Dataset\\RQ1\\for-real-validation\\file32394.wav_16k.wav_norm.wav_mono.wav_silence.wav', 'real')

 ('Data\\DeepVC-Dataset\\RQ3\\for-bh-madefake-final-r4k\\file14267.wav_16k.wav_norm.wav_mono.wav_silence.wav_00.wav.noisered.wav', 'fake')

 ('Data\\DeepVC-Dataset\\RQ3\\for-deep4s-madefake-final-r4k\\file1539.wav_16k.wav_norm.wav_mono.wav_silence.wav_01.wav.noisered.wav', 'fake')

 ('Data\\DeepVC-Dataset\\RQ3\\for-deep4s-madefake-final-r4k\\file26442.wav_16k.wav_norm.wav_mono.wav_silence.wav_03.wav.noisered.wav', 'fake')


# Feature Extraction (ONLY RUN ONCE)
- no need to run if you have histograms stored in set folders
- extracts histograms from SiF-DeepVC, not H-Voice. H-Voice already comes with histograms.

### Histograms
### Regular Function
- no limitations
- imiate H-voice histograms

In [13]:
import numpy as np
import librosa.display
import matplotlib.pyplot as plt
def compute_histogram(file_path, dir, iter):
    # Load audio file
    audio, sr = librosa.load(file_path, sr=None)

    # Calculate histogram of audio
    hist, bins = np.histogram(audio, bins=256, range=(-1, 1)) # Ours: 2^8 | Original: 2^16 bins

    # Plot histogram
    plt.figure()
    plt.bar(bins[:-1], hist, width=(bins[1] - bins[0]), color='black')
    #plt.title('Histogram of Audio')
    #plt.xlabel('Amplitude')
    #plt.ylabel('Frequency')
    plt.savefig(os.path.join(dir, f'hist_{iter}.png'))
    plt.close()
    #plt.show()
    #print(hist.shape)
    #print(hist.dtype)
    return hist

### Filtered Function - Limit Histograms under 4 kHz (WIP)
- Create a more generalized model by training our model on both H-Voice and SiF-DeepVC data sets.
- Limit the histograms to below 4000 Hz. Since the SiF-DeepVC's handcrafted SiFs were designed at above 4k Hz, we want to test the model's capabilities when ignoring the SiFs.

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import librosa.display

# Function to plot spectrogram
def plot_spectrogram(audio, sr, title):
    plt.figure(figsize=(10, 4))
    spectrogram = librosa.display.specshow(librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max), sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)

    plt.tight_layout()
    #plt.show()
    plt.close()

In [15]:
import numpy as np
import scipy.signal
import scipy.io.wavfile

def filter(audio_data, cutoff_frequency, sr):
    # Define the filter
    nyquist_frequency = sr / 2
    cutoff_normalized = cutoff_frequency / nyquist_frequency
    b, a = scipy.signal.butter(4, cutoff_normalized, btype='low')

    # Apply the filter to each channel
    filtered_audio = np.apply_along_axis(lambda x: scipy.signal.filtfilt(b, a, x), axis=0, arr=audio_data)

    return filtered_audio

# Load the original audio file
#sampling_rate, audio_data = scipy.io.wavfile.read(file_path)

In [16]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
def compute_histogram_filtered(file_path, dir, iter):
    # Load audio file
    audio, sr = librosa.load(file_path, sr=44100)
    # Plot spectrogram of original audio
    #plot_spectrogram(audio, 44100, title='Original Audio Spectrogram')


    cutoff_frequency = 4000
    filtered_audio = filter(audio, cutoff_frequency, sr=44100)
    # Plot spectrogram of filtered audio
    #plot_spectrogram(filtered_audio, 44100, title='Filtered Audio Spectrogram')
    # Calculate histogram of audio
    hist, bins = np.histogram(filtered_audio, bins=256, range=(-1, 1)) # 2^8 bins

    # Plot histogram
    plt.figure()
    plt.bar(bins[:-1], hist, width=(bins[1] - bins[0]), color='black')
    #plt.title('Histogram of Audio')
    #plt.xlabel('Amplitude')
    #plt.ylabel('Frequency')
    plt.savefig(os.path.join(dir, f'hist_{iter}.png'))
    #plt.show()
    plt.close()
    #print(hist.shape)
    #print(hist.dtype)
    return hist

# Test
compute_histogram_filtered(test_set[0][0], r"E:\Data_Voice-Id\filtered_data\train", 2)

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,    15,    63,
         170,   215,   328,   269,   366,   346,   954,  1595,  3593,
        6557, 16118,

### Store Histograms - Regular

In [None]:
import os

# Define the root directory
root_dir = 'E:/Data_Voice-Id/'  # Update with your actual root directory

# New save directory structure
save_dir = os.path.join(root_dir, 'hist_filtered_data', 'training_data')

# Create the save directory if it doesn't exist
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

batch_size = 256
num_batches = len(train_set) // batch_size  # Calculate the number of batches

for batch_index in range(num_batches):
    start_index = batch_index * batch_size
    end_index = (batch_index + 1) * batch_size

    for i, (file_path, label) in enumerate(train_set[start_index:end_index]):
        # Create a label directory within the new save directory
        label_dir = os.path.join(save_dir, label)
        if not os.path.isdir(label_dir):
            os.makedirs(label_dir)  # Create the directory if it doesn't exist

        # Compute and save the histogram
        hist = compute_histogram(file_path, label_dir, i + start_index)

# Process the remaining items (if any) after the last full batch
remaining_items = len(train_set) % batch_size
if remaining_items > 0:
    start_index = num_batches * batch_size
    for i, (file_path, label) in enumerate(train_set[start_index:]):
        label_dir = os.path.join(save_dir, label)
        if not os.path.isdir(label_dir):
            os.makedirs(label_dir)  # Create the directory if it doesn't exist

        # Compute and save the histogram
        hist = compute_histogram(file_path, label_dir, i + start_index)


In [2]:
import os

# Define the directory you want to count files in
directory_path1 = r'hist_filtered_data\training_data\real'  # Use raw string or double backslashes
directory_path2 = r'hist_filtered_data\training_data\fake'  # Use raw string or double backslashes

# Count the number of files
num_files1 = len([f for f in os.listdir(directory_path1) if os.path.isfile(os.path.join(directory_path1, f))])
num_files2 = len([f for f in os.listdir(directory_path2) if os.path.isfile(os.path.join(directory_path2, f))])
num_files=num_files1+num_files2

print(f"Total number of files in : {num_files}")



Total number of files in : 5465


In [17]:
root_dir = 'E:/Data_Voice-Id/'  # Update with your actual root directory

# New save directory for validation data
save_dir = os.path.join(root_dir, 'hist_filtered_data', 'validation_data')

# Create the save directory if it doesn't exist
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

iter = 0

for file_path, label in val_set:
    label_dir = os.path.join(save_dir, label)
    if not os.path.isdir(label_dir):
        os.makedirs(label_dir)  # Create the directory if it doesn't exist

    # Compute and save the histogram
    hist = compute_histogram(file_path, label_dir, iter)
    iter += 1

In [18]:
# Define the root directory
root_dir = 'E:/Data_Voice-Id/'  # Update with your actual root directory

# New save directory for test data
save_dir = os.path.join(root_dir, 'hist_filtered_data', 'test_data')

# Create the save directory if it doesn't exist
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

iter = 0

for file_path, label in test_set:
    label_dir = os.path.join(save_dir, label)
    if not os.path.isdir(label_dir):
        os.makedirs(label_dir)  # Create the directory if it doesn't exist

    # Compute and save the histogram
    hist = compute_histogram(file_path, label_dir, iter)
    iter += 1

In [19]:
# Define the root directory
root_dir = 'E:/Data_Voice-Id/'  # Update with your actual root directory

# New save directory for Deep4SNet target test data
save_dir = os.path.join(root_dir, 'hist_filtered_data', 'deep4s_data')

# Create the save directory if it doesn't exist
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

iter = 0

for file_path, label in deep4s_set:
    label_dir = os.path.join(save_dir, label)
    if not os.path.isdir(label_dir):
        os.makedirs(label_dir)  # Create the directory if it doesn't exist

    # Compute and save the histogram
    hist = compute_histogram(file_path, label_dir, iter)
    iter += 1


In [None]:
# Count histograms in each directory
def count_png_files(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".png"):
                count += 1
    return count

directories = [
    root_dir + 'Voice_Cloning_Detection/Data/SiF-DeepVC/Training_Set/',
    root_dir + 'Voice_Cloning_Detection/Data/SiF-DeepVC/Validation_Set/',
    root_dir + 'Voice_Cloning_Detection/Data/SiF-DeepVC/Test_Set/',
    root_dir + 'Voice_Cloning_Detection/Data/SiF-DeepVC/Deep4SNet_Target_Test_Set/'
]

for directory in directories:
    total_count = 0
    print("Directory:", directory)
    for sub_dir in os.listdir(directory):
        sub_dir_path = os.path.join(directory, sub_dir)
        if os.path.isdir(sub_dir_path):
            png_count = count_png_files(sub_dir_path)
            total_count += png_count
            print("   Subdirectory:", sub_dir, "| Histograms:", png_count)
    print("Total histograms: ", total_count)
    print()