In [None]:
import pandas as pd
import torch

In [None]:
# !pip install datasets==3.6.0
import os

from huggingface_hub import login

# Never paste the access token into the notebook. Read it from the HF_TOKEN
# environment variable; when that is unset or empty, token=None makes
# huggingface_hub fall back to its interactive login prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from datasets import Dataset, DatasetDict, Audio, load_dataset

In [None]:
# Load the raw transcription dataset by its Hub repository id. The cells below
# read its "train", "validation" and "test" splits.
ds = load_dataset("Elormiden/RIK_Cypriot_Collection_Global_with_transcriptions")

In [None]:
# Materialise each split as a pandas DataFrame so it can be cleaned with pandas.
ds_train, ds_val, ds_test = (
    ds[split_name].to_pandas() for split_name in ("train", "validation", "test")
)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" after each report.
ds_train.info()
ds_val.info()
ds_test.info()

In [None]:
def clean_ds(ds):
  """Return a cleaned, shuffled copy of a transcription DataFrame.

  Steps, in order:
    1. keep only the first row for each distinct ``text`` value;
    2. drop rows whose text contains a ``-<digits>-`` marker;
    3. drop rows with a missing value in any column;
    4. shuffle deterministically (``random_state=42``) and renumber the index;
    5. drop rows whose text repeats one character four or more times in a
       row, is entirely wrapped in parentheses (e.g. ``(music)``), or is a
       single line of at most 9 characters.

  The input frame is not modified. Step 5 runs after the index has been
  renumbered, so the returned index can contain gaps.

  Args:
    ds: pandas DataFrame with a string ``text`` column.

  Returns:
    A new DataFrame holding only the rows that survive every filter.
  """
  # Deduplicate on the frame itself. drop_duplicates(inplace=True) called on
  # the ds['text'] Series only altered that temporary Series, so duplicate
  # rows were never removed from the frame.
  ds = ds.drop_duplicates(subset="text")
  ds = ds[~ds["text"].str.contains(r'-\d+-', regex=True, na=False)] # -number- deletion
  ds = ds.dropna()
  ds = ds.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle dataset

  # The remaining filters are row-wise, so they can be evaluated on the same
  # frame and combined into a single mask.
  text = ds["text"]
  has_char_run = text.str.contains(r"(.)\1{3,}", regex=True) # same character 4+ times in a row
  is_parenthesised = text.str.strip().str.match(r"^\(.*\)$") # (music)-style lines
  is_too_short = text.str.match(r"^.{0,9}$") # fewer than 10 symbols
  return ds[~(has_char_run | is_parenthesised | is_too_short)]

In [None]:
# Run the same cleaning routine over every split.
train_ds_clean, val_ds_clean, test_ds_clean = (
    clean_ds(split_frame) for split_frame in (ds_train, ds_val, ds_test)
)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" after each report.
train_ds_clean.info()
val_ds_clean.info()
test_ds_clean.info()

In [None]:
# Convert each cleaned DataFrame back into a Hugging Face Dataset.
hf_train, hf_val, hf_test = map(
    Dataset.from_pandas, (train_ds_clean, val_ds_clean, test_ds_clean)
)

In [None]:
# Empty placeholder: the next cell rebinds `cleaned_ds` to a populated
# DatasetDict, so this instance is never used.
cleaned_ds = DatasetDict()

In [None]:
# Bundle the three cleaned splits under their standard split names.
cleaned_ds = DatasetDict(train=hf_train, validation=hf_val, test=hf_test)

In [None]:
# Declare the "audio" column of every split as an Audio feature with a 16 kHz
# sampling rate.
# NOTE(review): assumes each split has an "audio" column; the dataset schema
# is not visible in this notebook.
cleaned_ds = cleaned_ds.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Display the DatasetDict to inspect its splits before the final column cleanup.
cleaned_ds

In [None]:
# Dataset.from_pandas keeps a non-default pandas index as an extra
# "__index_level_0__" column. A split whose index is still a plain default
# range has no such column, and remove_columns raises on unknown names, so
# drop it only where it is actually present.
cleaned_ds = DatasetDict({
    split_name: (
        split.remove_columns(["__index_level_0__"])
        if "__index_level_0__" in split.column_names
        else split
    )
    for split_name, split in cleaned_ds.items()
})

In [None]:
# Upload every split of the cleaned dataset to the Hub under the "_cleaned"
# repository name. Relies on the login performed at the top of the notebook.
cleaned_ds.push_to_hub("Elormiden/RIK_Cypriot_Collection_Global_with_transcriptions_cleaned")