In [28]:
import pandas as pd
import re 
import numpy as np
import os
import tqdm


In [35]:
df = pd.read_csv("data/cv-valid-test.csv")
df

Unnamed: 0,new_file_name,gender
0,cv-valid-test/sample-000001.npy,male
1,cv-valid-test/sample-000003.npy,male
2,cv-valid-test/sample-000005.npy,male
3,cv-valid-test/sample-000006.npy,male
4,cv-valid-test/sample-000007.npy,male
...,...,...
1524,cv-valid-test/sample-003976.npy,male
1525,cv-valid-test/sample-003979.npy,male
1526,cv-valid-test/sample-003980.npy,male
1527,cv-valid-test/sample-003984.npy,male


In [38]:
label2int = {
    "male": 1,
    "female": 0
}


def load_data(vector_length=128):
    """A function to load gender recognition dataset from `data` folder
    After the second run, this will load from results/features.npy and results/labels.npy files
    as it is much faster!"""
    # make sure results folder exists
    if not os.path.isdir("results"):
        os.mkdir("results")
    # if features & labels already loaded individually and bundled, load them from there instead
    if os.path.isfile("results/features.npy") and os.path.isfile("results/labels.npy"):
        X = np.load("results/features.npy")
        y = np.load("results/labels.npy")
        return X, y
    # read dataframe
    df = pd.read_csv("data/train.csv")
    # get total samples
    n_samples = len(df)
    # get total male samples
    n_male_samples = len(df[df['gender'] == 'male'])
    # get total female samples
    n_female_samples = len(df[df['gender'] == 'female'])
    print("Total samples:", n_samples)
    print("Total male samples:", n_male_samples)
    print("Total female samples:", n_female_samples)
    # initialize an empty array for all audio features
    X = np.zeros((n_samples, vector_length))
    # initialize an empty array for all audio labels (1 for male and 0 for female)
    y = np.zeros((n_samples, 1))
    for i, (filename, gender) in tqdm.tqdm(enumerate(zip(df['file_path'], df['gender'])), "Loading data", total=n_samples):
        features = np.load(filename)
        X[i] = features
        y[i] = label2int[gender]
    # save the audio features and labels into files
    # so we won't load each one of them next run
    np.save("results/features", X)
    np.save("results/labels", y)
    return X, y

In [39]:
load_data(vector_length=128)

Total samples: 24
Total male samples: 21
Total female samples: 3


Loading data: 100%|██████████| 24/24 [00:00<00:00, 4010.65it/s]


(array([[8.42086077e-02, 2.89900601e-01, 7.50746965e-01, ...,
         1.04944611e-05, 3.19747528e-06, 2.16807948e-07],
        [6.07935945e-04, 4.89392667e-04, 9.58284887e-04, ...,
         7.27368970e-05, 1.75796504e-05, 8.67179835e-07],
        [9.74186044e-03, 1.03523461e-02, 9.12076682e-02, ...,
         1.17253985e-04, 5.33933635e-05, 7.06376977e-06],
        ...,
        [7.45988963e-03, 7.84977339e-03, 1.78759784e-01, ...,
         6.84204551e-08, 4.09083043e-08, 1.59165281e-09],
        [1.59764884e-03, 1.20121948e-02, 8.13301951e-02, ...,
         4.49410642e-09, 1.68088821e-09, 1.52752866e-10],
        [1.33251930e-02, 3.15138936e-01, 1.05973303e+00, ...,
         9.31911694e-04, 2.95696285e-04, 1.48342770e-05]]),
 array([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],

In [21]:
# for x in df["new_file_name"]:
def clean_path(x):
    path = f"data/{x}" 
    print(path)
    file_name = re.sub(".npy","",str(x.split("/")[1]))
    return path

def clean_gender(x):
    if re.search("^male",x):
        return 1
    else:
        return 0




In [22]:
df["file_path"] = df["new_file_name"].apply(lambda x: clean_path(x))
df["gender_label"]= df["gender"].apply(lambda x: clean_gender(x))


data/cv-valid-test/sample-000001.npy
data/cv-valid-test/sample-000003.npy
data/cv-valid-test/sample-000005.npy
data/cv-valid-test/sample-000006.npy
data/cv-valid-test/sample-000007.npy
data/cv-valid-test/sample-000008.npy
data/cv-valid-test/sample-000009.npy
data/cv-valid-test/sample-000011.npy
data/cv-valid-test/sample-000014.npy
data/cv-valid-test/sample-000016.npy
data/cv-valid-test/sample-000018.npy
data/cv-valid-test/sample-000022.npy
data/cv-valid-test/sample-000024.npy
data/cv-valid-test/sample-000026.npy
data/cv-valid-test/sample-000031.npy
data/cv-valid-test/sample-000033.npy
data/cv-valid-test/sample-000034.npy
data/cv-valid-test/sample-000035.npy
data/cv-valid-test/sample-000036.npy
data/cv-valid-test/sample-000037.npy
data/cv-valid-test/sample-000038.npy
data/cv-valid-test/sample-000041.npy
data/cv-valid-test/sample-000042.npy
data/cv-valid-test/sample-000043.npy
data/cv-valid-test/sample-000051.npy
data/cv-valid-test/sample-000052.npy
data/cv-valid-test/sample-000055.npy
d

In [24]:
df.to_csv("data/train.csv")

In [26]:
def get_npy(df):
    data_list = []
    for x in df["file_path"]:
        data_list.append(np.load(x))
    return data_list

def get_label(df):
    data_list = []
    for x in df["gender_label"]:
        data_list.append(x)
    return data_list