In [1]:
import pandas as pd
import numpy as np 

import tensorflow as tf

import os

import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [2]:
# Load CSVs

train_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04-sw/cv-corpus-10.0-delta-2022-07-04/sw/train.tsv', sep='\t')
test_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04-sw/cv-corpus-10.0-delta-2022-07-04/sw/test.tsv', sep='\t')



In [4]:
train_df.head(2)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment
0,0ac2b2e3e44a446fb9f6989da2b67a98ea65b963ef35ea...,common_voice_sw_31911896.mp3,Unatakiwa kumaliza kazi mapema.,12,0,thirties,male,,sw,
1,0ac2b2e3e44a446fb9f6989da2b67a98ea65b963ef35ea...,common_voice_sw_31940313.mp3,Kama fuza wana wengi,2,0,thirties,male,,sw,


In [5]:
test_df.head(2)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment
0,184cb4d570c6c8ed1de5cbd2efeae921de818db23178ab...,common_voice_sw_32195361.mp3,Chuza vyema,2,1,,,,sw,
1,1cc634ad9e9493ddcb81222bfc9c4703bbf19e38a2cbfe...,common_voice_sw_32366407.mp3,Watoto watukutu wamfuja mimeangwa,2,0,,,,sw,


In [6]:
merged_df = pd.concat([train_df, test_df], ignore_index=True)


In [7]:
import pandas as pd
from collections import Counter

# Create DataFrame
data = merged_df[['path','sentence']]
train_df = pd.DataFrame(data)

# Count the occurrences of each word in the entire dataset
all_words = ' '.join(train_df['sentence']).split()
word_counts = Counter(all_words)

# Function to check if a sentence can be moved to the test set
def can_move_to_test_set(sentence):
    words = sentence.split()
    for word in words:
        if word_counts[word] <= 1:
            return False
        word_counts[word] -= 1

    # Check if each word still appears in the remaining dataset
    for word in words:
        if word_counts[word] == 0:
            for w in words:
                word_counts[w] += 1  # Rollback the changes
            return False

    return True

# Form the test set
test_set = train_df[train_df['sentence'].apply(can_move_to_test_set)]

# The remaining data is the training set
train_set = train_df.drop(test_set.index)

# Reset word_counts to its original state
word_counts = Counter(all_words)


In [8]:
common = pd.merge(train_df, test_df, on=['path', 'sentence'])

# Getting the indexes of the common rows in train_df
common_indexes = train_df.merge(common, on=['path', 'sentence']).index

# Dropping the common rows from train_df by using the indexes
train_df = train_df.drop(common_indexes)


In [9]:
test_df = test_df[['path', 'sentence']]

In [10]:
# Function to extract features
def extract_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None, duration=5)  
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return mfccs.T

In [12]:
# pip install Numpy==1.21.4

In [13]:
# pip install numba==0.53.0

In [11]:
base_dir = 'cv-corpus-10.0-delta-2022-07-04-sw/cv-corpus-10.0-delta-2022-07-04/sw/clips'
X_train = np.array([extract_features(os.path.join(base_dir, path)) for path in train_df['path']])
X_test = np.array([extract_features(os.path.join(base_dir, path)) for path in test_df['path']])


  return f(*args, **kwargs)


In [12]:
len(X_train)

341

In [13]:
len(X_test)

302

In [14]:
le = LabelEncoder()

y_train = le.fit_transform(train_df['sentence'])
y_test = le.transform(test_df['sentence'])


In [15]:
# Build Model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None, 13)), 
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dense(len(np.unique(y_train)), activation='softmax')
])

# Compile Model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print Model Summary
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, None, 512)         552960    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dense (Dense)                (None, 341)               174933    
Total params: 2,302,805
Trainable params: 2,302,805
Non-trainable params: 0
_________________________________________________________________


In [16]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(341,)
(341,)
(302,)
(302,)


In [17]:
print(type(X_train[0]))
print(X_train[0].shape)


<class 'numpy.ndarray'>
(313, 13)


In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming X_train is a list of 2D arrays
X_train = pad_sequences(X_train, padding='post', dtype='float32')
X_test = pad_sequences(X_test, padding='post', dtype='float32')


In [19]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [20]:
# Import IPython.display.Audio for playing audio
from IPython.display import Audio, display

# Choose a random sample or specific one
sample = train_df.sample(1)
audio_path = os.path.join(base_dir, sample['path'].values[0])
transcription = sample['sentence'].values[0]

# Play the audio
display(Audio(audio_path))

# Print the transcription
print("Transcription:", transcription)


Transcription: Wezi wamehsisha wanafunzi


In [21]:
# Import IPython.display.Audio for playing audio
from IPython.display import Audio, display

# Choose a random sample or specific one
sample = train_df.sample(2)
audio_path = os.path.join(base_dir, sample['path'].values[0])
transcription = sample['sentence'].values[0]

# Play the audio
display(Audio(audio_path))

# Print the transcription
print("Transcription:", transcription)


Transcription: Eo mwedhi umeonekana kesho ni ramadhani


In [22]:
# Import IPython.display.Audio for playing audio
from IPython.display import Audio, display

# Choose a random sample or specific one
sample = test_df.sample(1)
audio_path = os.path.join(base_dir, sample['path'].values[0])
transcription = sample['sentence'].values[0]

# Play the audio
display(Audio(audio_path))

# Print the transcription
print("Transcription:", transcription)


Transcription: Asha kampata kalamu mpya


In [23]:
# Import IPython.display.Audio for playing audio
from IPython.display import Audio, display

# Choose a random sample or specific one
sample = test_df.sample(2)
audio_path = os.path.join(base_dir, sample['path'].values[0])
transcription = sample['sentence'].values[0]

# Play the audio
display(Audio(audio_path))

# Print the transcription
print("Transcription:", transcription)


Transcription: Nchu ambae hueya vanyama


In [24]:
train_df[train_df['sentence'] == 'kauza']

Unnamed: 0,path,sentence


In [25]:
train_df['sentence'] 

302      Yapo maneno ambayo mmeyaswahilisha wenyewe.
303    Ah Mwanamme yule anawivu na mkewe kwelikweli.
304                    Kesho tutaenda kazini mapema.
305                         Vijana wamebadilika sana
306                              Sana twatumia ndugu
                           ...                      
638                           Jambo baya, jambo zuri
639                      Tunataka kuanzisha Taasisi.
640                    Sada kajibu swali aliloulizwa
641    Watoto wameharibu mashine ya kunyolea nywele.
642                             Habib kauza nyumbaye
Name: sentence, Length: 341, dtype: object

In [26]:
train_df['sentence']

302      Yapo maneno ambayo mmeyaswahilisha wenyewe.
303    Ah Mwanamme yule anawivu na mkewe kwelikweli.
304                    Kesho tutaenda kazini mapema.
305                         Vijana wamebadilika sana
306                              Sana twatumia ndugu
                           ...                      
638                           Jambo baya, jambo zuri
639                      Tunataka kuanzisha Taasisi.
640                    Sada kajibu swali aliloulizwa
641    Watoto wameharibu mashine ya kunyolea nywele.
642                             Habib kauza nyumbaye
Name: sentence, Length: 341, dtype: object

In [27]:
filtered_df = train_df[train_df['sentence'].str.contains('upepo', case=False, na=False)]


In [28]:
filtered_df

Unnamed: 0,path,sentence
315,common_voice_sw_31997172.mp3,Upepo watoka upande huu
327,common_voice_sw_31997397.mp3,Kama jamaa akiwa anadungadunga watu\t4\t0\ttwe...
433,common_voice_sw_32311566.mp3,Bahari inchafuka kwa upepo
592,common_voice_sw_32264809.mp3,Paa lya nyumba lyekugwa kwa upepo
