# **Data Preprocessing**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the age-bucket strings in df["age"] into integer class ids in df["label"].
le = LabelEncoder()
le.fit(df["age"])
df["label"] = le.transform(df["age"])

# The fitted encoder exposes the distinct classes it found; count and show them.
num_classes = len(le.classes_)
print("Classes:", le.classes_)

Classes: ['eighties' 'fifties' 'fourties' 'seventies' 'sixties' 'teens' 'thirties'
 'twenties']


In [None]:
from sklearn.model_selection import train_test_split

# Split on speaker ids rather than on individual rows, so that all rows of a
# given client_id land in exactly one of the two partitions.
speakers = df["client_id"].unique()
train_spk, val_spk = train_test_split(speakers, test_size=0.2, random_state=42)

# Pick the rows of each speaker group; .copy() gives frames independent of df.
train_df = df.loc[df["client_id"].isin(train_spk)].copy()
val_df = df.loc[df["client_id"].isin(val_spk)].copy()

print("Train:", len(train_df), "Val:", len(val_df))

Train: 44896 Val: 7988


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MFCCDataset(Dataset):
    """Dataset yielding fixed-length MFCC summary vectors with integer labels.

    Items are computed on demand: the audio file named in the row's
    ``full_path`` column is loaded, its MFCC matrix is computed, and the
    per-coefficient mean and standard deviation (reduced along axis 1) are
    concatenated into one vector of length ``2 * n_mfcc``.

    Args:
        dataframe: Table with at least ``full_path`` and ``label`` columns.
            Rows are addressed by position (``iloc``), so the index does not
            have to be contiguous.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        sample_rate: Sampling rate passed to ``librosa.load``. Defaults to
            16000, the value that was previously hard-coded.
    """

    def __init__(self, dataframe, n_mfcc=40, sample_rate=16000):
        self.df = dataframe
        self.n_mfcc = n_mfcc
        self.sample_rate = sample_rate

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the clip at position ``idx`` and return ``(features, label)``.

        Returns:
            A tuple of a ``float32`` tensor of shape ``(2 * n_mfcc,)`` and a
            scalar ``int64`` tensor holding the row's ``label`` value.
        """
        row = self.df.iloc[idx]

        signal, sr = librosa.load(row["full_path"], sr=self.sample_rate)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=self.n_mfcc)

        # Collapse the variable-length axis into two statistics per coefficient
        # so every clip produces a vector of the same size.
        features = np.concatenate([
            np.mean(mfcc, axis=1),
            np.std(mfcc, axis=1),
        ])

        # Force int64: without an explicit dtype the tensor inherits whatever
        # integer width pandas/numpy stored, while integer-target losses such
        # as torch.nn.CrossEntropyLoss expect long class indices.
        label = torch.tensor(int(row["label"]), dtype=torch.long)

        return torch.tensor(features, dtype=torch.float32), label

In [None]:
# Wrap each split in the on-demand MFCC dataset.
train_dataset, val_dataset = MFCCDataset(train_df), MFCCDataset(val_df)

# Only the training loader shuffles; the validation loader keeps the default
# sequential order.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    num_workers=2,
)