# Preprocessing

In [2]:
import librosa
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from typing import Any
from sklearn.model_selection import train_test_split
from utils import RAW_DATA_PATH, CSV_PATH
from numpy.typing import NDArray

# Alias for NumPy arrays holding any floating-point dtype; used in the
# annotations further down.
type FloatArray = NDArray[np.floating[Any]]

# Apply seaborn's "whitegrid" style to plots drawn after this point.
sns.set_style("whitegrid")

In [3]:
# Read the metadata table and discard the two columns that are not used below.
df = pd.read_csv(CSV_PATH).drop(columns=['Speaker ID', 'Number of Words'])

# Binary label: 1 where the pronunciation is marked 'Wrong', 0 otherwise.
df['Pronunciation'] = df['Pronunciation'].eq('Wrong').astype(np.int_)

# Keep only the leading run of digits of 'Word ID', stored as an integer.
df['Word ID'] = df['Word ID'].str.extract(r'^(\d+)')[0].astype(int)

def load_audio(path: str, sr: int = 22050) -> dict[str, FloatArray | int]:
    """Read one audio file located under ``RAW_DATA_PATH``.

    Args:
        path: File location relative to ``RAW_DATA_PATH``.
        sr: Sampling rate forwarded to ``librosa.load``.

    Returns:
        A dict with two entries: ``'values'`` holds the first item returned
        by ``librosa.load`` (the audio samples) and ``'sr'`` holds the second
        (the sampling rate librosa reports for them).
    """
    samples, rate = librosa.load(RAW_DATA_PATH / path, sr=sr)
    return dict(values=samples, sr=rate)

# Decode every clip exactly once: both 'Audio' and 'SR' are taken from the
# same load_audio result, rather than reading and decoding each file twice.
clips = df['Path'].apply(load_audio)
df['Audio'] = clips.apply(lambda clip: clip['values'])
df['SR'] = clips.apply(lambda clip: clip['sr'])
df = df.drop('Path', axis=1)

# Encode gender as 1 for 'Boy' and 0 for every other value.
df['Gender'] = np.int_(df['Gender'] == 'Boy')

target = 'Pronunciation'
X, y = df.drop(target, axis=1), df[target]

# Hold out 20% of the rows, stratified on the target so both splits keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# Re-attach the target column to each feature frame (assign returns a new
# frame, so X_train / X_test themselves are left untouched).
train_df = X_train.assign(**{target: y_train})
test_df = X_test.assign(**{target: y_test})

# Write both splits as LZ4-compressed feather files under RAW_DATA_PATH.
# NOTE(review): both frames keep the shuffled row index produced by
# train_test_split; some pandas releases reject a non-default index in
# to_feather -- confirm against the pandas version in use.
outputs = {'train.feather.lz4': train_df, 'test.feather.lz4': test_df}
for filename, frame in outputs.items():
    frame.to_feather(RAW_DATA_PATH / filename, compression='lz4')