### PREPROCESSING THE MOZILLA COMMON VOICE

In [None]:
# Install NumPy
%pip install numpy

# Install Pandas
%pip install pandas

# Install tqdm for progress bars
%pip install tqdm

# Install imbalanced-learn for the imblearn package
%pip install imbalanced-learn

# Install praat-parselmouth for audio processing
%pip install praat-parselmouth

In [2]:
# path to the cv-other-train csv file 
path_training_csv = "path to the cv-other-train csv file"

# path to the cv-other-train folder
path_training = "path to the cv-other-train folder" + "/"

# path to save the balanced data
folder_path = "path to save the balanced data"

In [3]:
import numpy as np 
import pandas as pd 
import os
import tqdm
import imblearn
from imblearn.over_sampling import RandomUnderSampler

In [39]:
label2int = {
    "male": 1,
    "female": 0
}
df = pd.read_csv(path_training_csv)
df = df[df['gender'].notna()]
df = df[df['gender'] != 'other']
df = df[["filename", "gender"]]
# get total samples
n_samples = len(df)
# get total male samples
n_male_samples = len(df[df['gender'] == 'male'])
# get total female samples
n_female_samples = len(df[df['gender'] == 'female'])
print("Total samples:", n_samples)
print("Total male samples:", n_male_samples)
print("Total female samples:", n_female_samples)
df['gender']=df['gender'].apply(lambda x: label2int[x])
print(df.head())

Total samples: 63253
Total male samples: 49398
Total female samples: 13855
                           filename  gender
0  cv-other-train/sample-000000.mp3       1
1  cv-other-train/sample-000001.mp3       1
3  cv-other-train/sample-000003.mp3       1
5  cv-other-train/sample-000005.mp3       1
7  cv-other-train/sample-000007.mp3       0


### UnderSampling

In [40]:

X = df['filename'].values.reshape(-1, 1) 
y = df['gender'].values
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X, y)
df = pd.DataFrame({'filename': X_rus.ravel(), 'gender': y_rus})
print("Males after rebalancing :" , np.count_nonzero(y_rus==1))
print("Females after rebalancing :" , np.count_nonzero(y_rus==0))


Males after rebalancing : 13855
Females after rebalancing : 13855


## Filtering silent audios

### Using parselmouth : worked

In [37]:
import parselmouth

def is_audio_silent2(audio_path, silence_threshold=40):
    # Load the audio file
    sound = parselmouth.Sound(audio_path)

    # Measure the intensity
    intensity = sound.to_intensity()

    # Calculate the mean intensity
    mean_intensity = intensity.values.T.mean()

    # Determine if the audio is silent
    is_silent = mean_intensity < silence_threshold
    return is_silent

    


True


In [41]:
df['is silent'] = df['filename'].apply(lambda x: is_audio_silent2(path_training + x))
print(df.head())
df =df[df['is silent'] == False]
print(df.head())
n_samples = len(df)
# get total male samples
n_male_samples = len(df[df['gender'] == 0])
# get total female samples
n_female_samples = len(df[df['gender'] == 1])
print("Total samples:", n_samples)
print("Total male samples:", n_male_samples)
print("Total female samples:", n_female_samples)


                           filename  gender  is silent
0  cv-other-train/sample-000007.mp3       0      False
1  cv-other-train/sample-000009.mp3       0      False
2  cv-other-train/sample-000022.mp3       0      False
3  cv-other-train/sample-000041.mp3       0      False
4  cv-other-train/sample-000046.mp3       0      False
                           filename  gender  is silent
0  cv-other-train/sample-000007.mp3       0      False
1  cv-other-train/sample-000009.mp3       0      False
2  cv-other-train/sample-000022.mp3       0      False
3  cv-other-train/sample-000041.mp3       0      False
4  cv-other-train/sample-000046.mp3       0      False
Total samples: 23183
Total male samples: 11599
Total female samples: 11584


### Saving the Balanced Data

In [43]:

file_name = "Balanced_data.csv"
full_path = os.path.join(folder_path, file_name)