In [9]:
import torch
import librosa
from spellchecker import SpellChecker
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [10]:
# Training model used
# Loading the tarining model to the system
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)



In [15]:
product_path = ['Aata_Train','Achaar_Train', 'Anday_Train', 'Baisan_Train' , 'Channay_Train', 'Chawal_Train',
               'Cheeni_Train', 'Dabal_Roti_Train', 'Dahi_Train', 'Doodh_Train', 'Elaichi_Train',
               'Ghee_Train', 'Haldi_Train', 'Imli_Train', 'Namak_Train', 'Patti_Train', 'Sabun_Train',
               'Sirka_Train', 'Surf_Train', 'Tael_Train']

product_path_Test = ['Aata_Test','Achaar_Test', 'Anday_Test', 'Baisan_Test' , 'Channay_Test', 'Chawal_Test',
               'Cheeni_Test', 'Dabal_Roti_Test', 'Dahi_Test', 'Doodh_Test', 'Elaichi_Test',
               'Ghee_Test', 'Haldi_Test', 'Imli_Test', 'Namak_Test', 'Patti_Test', 'Sabun_Test',
               'Sirka_Test', 'Surf_Test', 'Tael_Test']


In [16]:
# Dictionary of words for spell checker
word_list = ['أتار', 'عندي', 'أندي', 'بسن', 'بيسن', 'جني', 'شاني', 'أتى', 'أعتا', 'شعول', 'جاول',
            'جيني', 'دبروتي', 'دبل روتي', 'دهي', 'دود', 'إلعَيْتي', 'إلعيتي', 'إِلَ عَيْتِي', 'إلى أيتي',
            'كي', 'غي', 'حل دي', 'حلدي', 'إملي', 'نمك', 'بتي', 'صاب', 'سعب', 'سركا', 'سرف', 'تيل']

# Used to calculate accuracy of each product
word_dict = {0: ['أتى', 'أعتا'],
            1: ['أتار'],
            2: ['عندي', 'أندي'],
            3: ['بيسن', 'بسن'],
            4: ['جني', 'شاني'],
            5: ['شعول', 'جاول'],
            6: ['جيني'],
            7: ['دبل روتي', 'دبروتي'],
            8: ['دهي'],
            9: ['دود'],
            10: ['إلعَيْتي', 'إلعيتي', 'إِلَ عَيْتِي', 'إلى أيتي'],
            11: ['غي', 'كي'],
            12: ['حل دي', 'حلدي'],
            13: ['إملي'],
            14: ['نمك'],
            15: ['بتي'],
            16: ['صاب', 'سعب'],
            17: ['سركا'],
            18: ['سرف'],
            19: ['تيل']}

# Stores accuracy of each product
dict_accuracy = {}

# SpellChecker 
# Edit Distance = 5
spell = SpellChecker(distance=5)
spell.word_frequency.load_words(word_list)

In [17]:
#Loop through each product
for num, product in enumerate(product_path_Test):
    
    # Load audio files of each product     
    path_audio = "../Dataset/Grocery_Dataset_Complete_Folders/"+product
    data = librosa.util.find_files(path_audio, ext=['wav'])
    
    # Pass each audio file to the model. Store the result in a list
    pred_word = []
    for vn in data:

        # Load file         
        voice_data = librosa.load(vn, sr=16000, mono=True)

        inputs = processor(voice_data[0], sampling_rate=16_000, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_word = processor.batch_decode(predicted_ids)

        # Append the word to the list
        pred_word.append(predicted_word)
        
    # Corrects the predicted word according to the dictionary     
    corrected_word = []
    
    for pred in pred_word:
        corrected_word.append(spell.correction(pred[0]))
    
    correctly_pred = 0

    # For each corrected word find the accuracy to
    # how to close all the audio files were correctly
    # mapped     
    for word in corrected_word:
        actual_word = word_dict[num]

        if word in actual_word:
            correctly_pred += 1

    # Finding Percentage of Correctly Predicted
    accuracy = (float(correctly_pred)/float(len(corrected_word))) * 100
    
    # Store the accuracy percentage w.r.t to the 
    # product in a dictionary
    dict_accuracy[product_path_Test[num]] = accuracy
    
    # Reset all the lists
    pred_word.clear()
    corrected_word.clear()
    
    print(num)

    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [18]:
for key, value in dict_accuracy.items():
    print(key, ' : ', value)

Aata_Test  :  60.0
Achaar_Test  :  77.77777777777779
Anday_Test  :  100.0
Baisan_Test  :  100.0
Channay_Test  :  90.0
Chawal_Test  :  88.88888888888889
Cheeni_Test  :  100.0
Dabal_Roti_Test  :  50.0
Dahi_Test  :  90.0
Doodh_Test  :  90.0
Elaichi_Test  :  70.0
Ghee_Test  :  90.0
Haldi_Test  :  90.0
Imli_Test  :  100.0
Namak_Test  :  90.0
Patti_Test  :  77.77777777777779
Sabun_Test  :  90.0
Sirka_Test  :  70.0
Surf_Test  :  90.0
Tael_Test  :  90.0


Following result was accomp on completely unseen data