# Dataset

This notebook is used as a first veiw on the dataset for understanding how the data can be used

## Make clean csv

We add multiple csv files for each language (train, test, dev). But there are useless information. Like the age the gender and the line said. Moreover, their is no columns for the language. So we had to créate our csv with all the needed information (id, path to the audio file, language). Here the langugage columns is our labels.

In [1]:
import polars as pl
import numpy as np
import os

In [23]:
dataset_path = "../data/common_voice_kpd/"
languages = os.listdir("../data/common_voice_kpd/")
dataset_path, languages[14]

('../data/common_voice_kpd/', 'Italian')

In [24]:
len(languages)

45

In [25]:
languages.index('Italian')

14

In [None]:
with open("../data/common_voice_kpd/French/train.csv", "r", encoding="utf-16") as f:
    lines = f.readline()
    lines = lines.split()
    client_id = np.array([])
    path = np.array([])
    language = np.array([])
    while True:
        lines = f.readline()
        if not lines:
            break
        lines = lines.split()
        client_id = np.append(client_id, np.int16(lines[0]))
        path = np.append(path, f"/train/{lines[1]}/{lines[2]}")
        language = np.append(language, "French")


    df = (
        pl.DataFrame({
            "client_id": client_id,
            "path":path,
            "language": language,
        })
    )

df
        

client_id,path,language
f64,str,str
98.0,"""/train/frn_trn_sp_98/common_vo…","""French"""
539.0,"""/train/frn_trn_sp_426/common_v…","""French"""
109.0,"""/train/frn_trn_sp_109/common_v…","""French"""
1068.0,"""/train/frn_trn_sp_691/common_v…","""French"""
1036.0,"""/train/frn_trn_sp_675/common_v…","""French"""
…,…,…
150.0,"""/train/frn_trn_sp_150/common_v…","""French"""
351.0,"""/train/frn_trn_sp_331/common_v…","""French"""
24.0,"""/train/frn_trn_sp_24/common_vo…","""French"""
1071.0,"""/train/frn_trn_sp_692/common_v…","""French"""


In [30]:
file_type = ["train","test","dev"]
for lang in languages:
    for file in file_type:
        with open(f"../data/common_voice_kpd/{lang}/{file}.csv", "r", encoding="utf-16") as f:
            lines = f.readline()
            client_id = np.array([], dtype=np.int64)
            path = np.array([])
            language = np.array([])
            label = np.array([], dtype=np.int64)
            while True:
                lines = f.readline()
                if not lines:
                    break
                lines = lines.split()
                client_id = np.append(client_id, np.int64(lines[0]))
                path = np.append(path, f"/{lang}/{file}/{lines[1]}/{lines[2]}")
                language = np.append(language, lang)
                label = np.append(label, np.int64(languages.index(lang)))

            df = (
                pl.DataFrame({
                    "client_id": client_id,
                    "path":path,
                    "language": language,
                    "label": label,
                })
            )

        df.write_csv(f"../data/common_voice_kpd/{lang}/{file}_clean.csv")

In [31]:
file_type = ["train_clean","test_clean","dev_clean"]
for file in file_type:
    combined_df = pl.DataFrame({
        "client_id": np.array([]),
        "path": np.array([]),
        "language": np.array([]),
        "label":np.array([]),
    }).with_columns(
        pl.col("*").cast(pl.String)
    )
    for lang in languages:
        df = pl.read_csv(
            f"../data/common_voice_kpd/{lang}/{file}.csv"
            ).with_columns(
                pl.col("*").cast(pl.String)
            )
        combined_df = pl.concat([combined_df, df])
    combined_df.write_csv(f"../data/{file}.csv")

In [33]:
dfTrain = pl.read_csv("../data/train_clean.csv").sample(fraction=1.0, shuffle=True)
dfTrain.head()

client_id,path,language,label
i64,str,str,i64
282,"""/Arabic/train/ara_trn_sp_121/c…","""Arabic""",42
268,"""/Hakha_Chin/train/hkch_trn_sp_…","""Hakha_Chin""",39
69,"""/Estonian/train/est_trn_sp_128…","""Estonian""",31
1205,"""/Swedish/train/swd_trn_sp_130/…","""Swedish""",22
96,"""/Interlingua/train/intl_trn_sp…","""Interlingua""",21


In [34]:
dfTrain['path'][0]

'/Arabic/train/ara_trn_sp_121/common_voice_ar_19205263.wav'

## Load wav files

In order to create the model we had to read the data (WAV)

In [3]:
import torchaudio as ta

waveform, sample_rate = ta.load("../data/common_voice_kpd/Swedish/train/swd_trn_sp_80/common_voice_sv-SE_19518866.wav")
waveform, sample_rate

(tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 6.1035e-05, 0.0000e+00,
          6.1035e-05]]),
 16000)

In [4]:
waveform.shape

torch.Size([1, 58752])

In [43]:
import torchaudio.transforms as T

mel_transform = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)
mel = mel_transform(waveform)
mel = torch.cat([mel,mel],dim=2)
mel.shape

torch.Size([1, 64, 230])

## Make Dataloader for batch

In [44]:
import torch
from torch.utils.data import Dataset
import torchaudio as ta
import torchaudio.transforms as T

SAMPLE_RATE = 16000
MEL_TRANSFORM = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

class VoiceDataset(Dataset):
    def __init__(self, df:pl.DataFrame, transform=MEL_TRANSFORM, target_sr=SAMPLE_RATE, root="."):
        self.paths = df['path']
        self.labels = df['label']
        self.transform = transform
        self.target_sr = target_sr
        self.root = root

    def __len__(self):
        return len(self.paths)
    
    def __getitem__(self, index):
        path = self.root + self.paths[index]

        waveform, sample_rate = ta.load(path)
        
        if sample_rate != self.target_sr:
            waveform = ta.functional.resample(waveform,sample_rate, self.target_sr)

        if self.transform is not None:
            features = self.transform(waveform)
            features = torch.cat([features, features], dim=2)
        else:
            features = waveform

        label = self.labels[index]

        return features, label

In [49]:
from torch.utils.data import DataLoader

dataset = VoiceDataset(dfTrain, root="../data/common_voice_kpd")

def pad_collate(batch):
    features = [item[0].squeeze(0).transpose(0,1) for item in batch]  # (n_mels, T) → (T, n_mels) pour LSTM
    labels = [item[1] for item in batch]

    lengths = torch.tensor([f.shape[0] for f in features])

    padded = torch.nn.utils.rnn.pad_sequence(features, batch_first=True)

    return padded, torch.tensor(labels), lengths

loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=pad_collate
)

In [60]:
dataset[6][1]

27