## Downloading Data

In [1]:
import requests
from tqdm.auto import tqdm

In [2]:
data_urls =  ["https://api.zindi.africa/v1/competitions/swahili-audio-classification-challenge/files/Train.csv", "https://api.zindi.africa/v1/competitions/swahili-audio-classification-challenge/files/Swahili_words.zip"
,"https://api.zindi.africa/v1/competitions/swahili-audio-classification-challenge/files/Test.csv"]
token = {'auth_token':"RdTLqJ3bNbXaqaefVwQkYm5N"}
def zindi_data_downloader(url, token, file_name):
    # Get the competition data
    competition_data = requests.post(url = data_url, data= token, stream=True)
    
    # Progress bar monitor download
    pbar = tqdm(desc=file_name, total=int(competition_data.headers.get('content-length', 0)), unit='B', unit_scale=True, unit_divisor=512)
    # Create and Write the data to colab drive in chunks
    handle = open(file_name, "wb")
    for chunk in competition_data.iter_content(chunk_size=512): # Download the data in chunks
        if chunk: # filter out keep-alive new chunks
                handle.write(chunk)
        pbar.update(len(chunk))
    handle.close()
    pbar.close()
for data_url in data_urls:
  file_name = data_url.split('/')[-1]
  zindi_data_downloader(url = data_url, token = token, file_name = file_name)


Train.csv:   0%|          | 0.00/245k [00:00<?, ?B/s]

Swahili_words.zip:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Test.csv:   0%|          | 0.00/70.3k [00:00<?, ?B/s]

In [3]:
!unzip -q /content/Swahili_words.zip -d /content/swahili_words

## Installing Dependencies

In [4]:
%%capture

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa

In [5]:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


In [6]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import tensorflow
import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

## Data Processing

In [7]:
df = pd.read_csv("/content/Train.csv")
df.head()

Unnamed: 0,Word_id,Swahili_word,English_translation
0,id_v8rz06e6rv31.wav,mbili,two
1,id_vmbwicdpfn68.wav,tatu,three
2,id_injlouhxg1hg.wav,ndio,yes
3,id_gdq23p6xgzya.wav,nne,four
4,id_lpstklz51zpz.wav,nane,eight


In [8]:
df['path'] = "/content/swahili_words/" + df['Word_id']

In [9]:
# Filter broken and non-existed paths

print(f"Step 0: {len(df)}")

df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
df = df.dropna(subset=["path"])
df = df.drop("status", 1)
print(f"Step 1: {len(df)}")

df = df.sample(frac=1)
df = df.reset_index(drop=True)
df.head()

Step 0: 4200
Step 1: 4200


  import sys


Unnamed: 0,Word_id,Swahili_word,English_translation,path
0,id_j6hsq1dcl82k.wav,tatu,three,/content/swahili_words/id_j6hsq1dcl82k.wav
1,id_g1jm3297ckjs.wav,tatu,three,/content/swahili_words/id_g1jm3297ckjs.wav
2,id_af2gwe4yn6x3.wav,nne,four,/content/swahili_words/id_af2gwe4yn6x3.wav
3,id_ohj7j97z85ec.wav,sita,six,/content/swahili_words/id_ohj7j97z85ec.wav
4,id_cz5vtgjmhxk3.wav,tano,five,/content/swahili_words/id_cz5vtgjmhxk3.wav


Let's explore how many labels (emotions) are in the dataset with what distribution.

In [10]:
print("Labels: ", df["Swahili_word"].unique())
print()
df.groupby("Swahili_word").count()[["path"]]

Labels:  ['tatu' 'nne' 'sita' 'tano' 'tisa' 'kumi' 'moja' 'saba' 'ndio' 'hapana'
 'nane' 'mbili']



Unnamed: 0_level_0,path
Swahili_word,Unnamed: 1_level_1
hapana,350
kumi,350
mbili,350
moja,350
nane,350
ndio,350
nne,350
saba,350
sita,350
tano,350


In [11]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["Swahili_word"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()
speech = librosa.resample(np.asarray(speech), sr, 16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

ID Location: 2680
      Label: tatu



For training purposes, we need to split data into train test sets; in this specific example, we break with a `20%` rate for the test set.

In [12]:
import os
save_path = "/content/data_split"
os.mkdir(save_path)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=101, stratify=df["Swahili_word"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/val.csv", sep="\t", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

(3780, 4)
(420, 4)


## Prepare Data for Training

In [13]:
# Loading the created dataset using datasets
from datasets import load_dataset, load_metric


data_files = {
    "train": "/content/data_split/train.csv", 
    "validation": "/content/data_split/val.csv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

print(train_dataset)
print(eval_dataset)

Using custom data configuration default-1db2547214df2bc8


Downloading and preparing dataset csv/default to /content/cache/csv/default-1db2547214df2bc8/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-1db2547214df2bc8/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['Word_id', 'Swahili_word', 'English_translation', 'path'],
    num_rows: 3780
})
Dataset({
    features: ['Word_id', 'Swahili_word', 'English_translation', 'path'],
    num_rows: 420
})


In [14]:
# We need to specify the input and output column
input_column = "path"
output_column = "Swahili_word"

In [15]:
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 12 classes: ['hapana', 'kumi', 'mbili', 'moja', 'nane', 'ndio', 'nne', 'saba', 'sita', 'tano', 'tatu', 'tisa']


In [16]:
from transformers import AutoConfig, Wav2Vec2Processor

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [17]:
model_name_or_path = "alokmatta/wav2vec2-large-xlsr-53-sw"
pooling_mode = "mean"

In [18]:
# config
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [19]:
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/138 [00:00<?, ?B/s]

vocab_file vocab.json


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

tokenizer_config_file tokenizer_config.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json


Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The target sampling rate: 16000


# Preprocess Data

In [20]:
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

def preprocess_function(examples):
    speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    result = processor(speech_list, sampling_rate=target_sampling_rate)
    result["labels"] = list(target_list)

    return result

In [21]:
import tensorflow
import torchaudio

In [22]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    num_proc=4
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    num_proc=4
)

     

#0:   0%|          | 0/95 [00:00<?, ?ba/s]

 

  tensor = as_tensor(value)


  

#1:   0%|          | 0/95 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


#2:   0%|          | 0/95 [00:00<?, ?ba/s]

#3:   0%|          | 0/95 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)


        

#0:   0%|          | 0/11 [00:00<?, ?ba/s]

#1:   0%|          | 0/11 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)


#2:   0%|          | 0/11 [00:00<?, ?ba/s]

#3:   0%|          | 0/11 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)


In [23]:
idx = 0
print(f"Training input_values: {train_dataset[idx]['input_values']}")
print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['Swahili_word']}")

Training input_values: [0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.0008505519363097847, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, 0.00025972240837290883, -0.00033110714866779745, 0.00025972240837290883, 0.00025972240837290883, -0.00033110714866779745, 0.00025972240837290883, 0.0002597224083

Great, now we've successfully read all the audio files, resampled the audio files to 16kHz, and mapped each audio to the corresponding label.

## Model

Before diving into the training part, we need to build our classification model based on the merge strategy. 

In [24]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [25]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


### Set-up Trainer

Let's start by defining the data collator. The code for the data collator was copied from [this example](https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81).

Without going into too many details, in contrast to the common data collators, this data collator treats the `input_values` and `labels` differently and thus applies to separate padding functions on them (again making use of XLSR-Wav2Vec2's context manager). This is necessary because in speech input and output are of different modalities meaning that they should not be treated by the same padding function.
Analogous to the common data collators, the padding tokens in the labels with `-100` so that those tokens are **not** taken into account when computing the loss.

In [26]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [27]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Next, the evaluation metric is defined. There are many pre-defined metrics for classification/regression problems, but in this case, we would continue with just **Accuracy** for classification and **MSE** for regression. You can define other metrics on your own.

In [28]:
is_regression = False

In [29]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

    if is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

Now, we can load the pretrained XLSR-Wav2Vec2 checkpoint into our classification model with a pooling strategy.

In [30]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Downloading:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at alokmatta/wav2vec2-large-xlsr-53-sw were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at alokmatta/wav2vec2-large-xlsr-53-sw and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream t

In [31]:
model.freeze_feature_extractor()

In [32]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/wav2vec2-xlsr-swahili-speech-recognition",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=5,
    evaluation_strategy="steps",
    num_train_epochs=2.0,
    fp16=False,
    save_steps=250,
    eval_steps=250,
    logging_steps=100,
    learning_rate=1e-4,
    save_total_limit=4,
)

In [33]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


Now, all instances can be passed to Trainer and we are ready to start training!

In [34]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
)

### Training

In [35]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path, Swahili_word, English_translation, Word_id. If path, Swahili_word, English_translation, Word_id are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3780
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 5
  Total optimization steps = 756


Step,Training Loss,Validation Loss,Accuracy
250,0.1441,0.087838,0.983333
500,0.0604,0.090725,0.980952
750,0.0852,0.078872,0.983333


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path, Swahili_word, English_translation, Word_id. If path, Swahili_word, English_translation, Word_id are not expected by `Wav2Vec2ForSpeechClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 420
  Batch size = 4
Saving model checkpoint to /content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-250
Configuration saved in /content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-250/config.json
Model weights saved in /content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-250/pytorch_model.bin
Feature extractor saved in /content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-250/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: path, Swahili_word

TrainOutput(global_step=756, training_loss=0.2254573251835253, metrics={'train_runtime': 3015.5482, 'train_samples_per_second': 2.507, 'train_steps_per_second': 0.251, 'total_flos': 1.3092739181093683e+18, 'train_loss': 0.2254573251835253, 'epoch': 2.0})

## Evaluation

In [36]:
import librosa
from sklearn.metrics import classification_report

In [37]:
test_data = pd.read_csv("/content/Test.csv")
test_data.head()

Unnamed: 0,Word_id
0,id_jp2pxl0r84ya.wav
1,id_ndduqqvthbpx.wav
2,id_36oxymxfcm6q.wav
3,id_ue9b0to760pg.wav
4,id_prja4oprb914.wav


In [38]:
test_data['path'] = "/content/swahili_words/" + test_data['Word_id']
test_data.to_csv("/content/data_split/test_data.csv", sep="\t", encoding="utf-8", index=False)

In [39]:
test_dataset = load_dataset("csv", data_files={"test": "/content/data_split/test_data.csv"}, delimiter="\t")["test"]
test_dataset

Using custom data configuration default-3591b9f91d2f2a85


Downloading and preparing dataset csv/default to /content/cache/csv/default-3591b9f91d2f2a85/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-3591b9f91d2f2a85/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Word_id', 'path'],
    num_rows: 1800
})

In [40]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [52]:
model_name_or_path = "/content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-750"
processor_path =  "alokmatta/wav2vec2-large-xlsr-53-sw"
config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(processor_path)
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

loading configuration file /content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-750/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "/content/wav2vec2-xlsr-swahili-speech-recognition/checkpoint-750",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSpeechClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "

vocab_file vocab.json
tokenizer_config_file tokenizer_config.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json


loading configuration file config.json from cache at /content/cache/models--alokmatta--wav2vec2-large-xlsr-53-sw/snapshots/125fde65ac78845894cc4b67f57ea21c807ce371/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "alokmatta/wav2vec2-large-xlsr-53-sw",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extrac

In [53]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, processor.feature_extractor.sampling_rate)

    batch["speech"] = speech_array
    return batch



In [54]:
test_dataset = test_dataset.map(speech_file_to_array_fn)

  0%|          | 0/1800 [00:00<?, ?ex/s]

In [55]:

def predict(batch):
    features = processor(batch["speech"], sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits 

    pred_ids = torch.softmax(logits, dim=-1).detach().cpu().numpy()
    batch["predicted"] = pred_ids
    return batch

In [56]:
result = test_dataset.map(predict, batched=True, batch_size=4)

  0%|          | 0/450 [00:00<?, ?ba/s]

In [None]:
print(result["predicted"][:2])

In [57]:
label_names = [config.id2label[i] for i in range(config.num_labels)]
label_names

['hapana',
 'kumi',
 'mbili',
 'moja',
 'nane',
 'ndio',
 'nne',
 'saba',
 'sita',
 'tano',
 'tatu',
 'tisa']

In [58]:
test_result = pd.DataFrame()

In [59]:
test_result['Word_id'] = result['Word_id']
test_result[['hapana', 'kumi', 'mbili', 'moja', 'nane', 'ndio', 'nne', 'saba', 'sita', 'tano', 'tatu', 'tisa']] = result['predicted']

In [60]:
test_result.head()

Unnamed: 0,Word_id,hapana,kumi,mbili,moja,nane,ndio,nne,saba,sita,tano,tatu,tisa
0,id_jp2pxl0r84ya.wav,0.000156,6.4e-05,0.000134,0.000249,5.3e-05,0.000151,3e-05,0.000353,0.998373,0.000237,0.0001,0.000101
1,id_ndduqqvthbpx.wav,3.3e-05,0.000168,0.000138,9e-05,0.000149,0.000129,9.1e-05,0.000136,0.000212,0.00012,0.000112,0.998621
2,id_36oxymxfcm6q.wav,3.4e-05,0.000195,0.000169,9.5e-05,0.000152,0.000138,0.000101,0.000105,0.000197,0.000111,0.000104,0.998598
3,id_ue9b0to760pg.wav,0.000257,6.8e-05,6.9e-05,0.000151,0.000157,3.7e-05,7e-05,8.7e-05,0.000178,0.998526,0.000285,0.000115
4,id_prja4oprb914.wav,0.000378,0.000203,6.6e-05,5.9e-05,0.998166,8e-05,0.000322,0.000257,6.2e-05,0.00021,3e-05,0.000168


In [61]:
test_result.to_csv('500_ck_sub.csv', index=False)