In my previous [notebook](https://www.kaggle.com/code/mbmmurad/mp3-to-wav-conversion), we converted mp3 audio files to wav files, but converting the validation files took almost 25 minutes. In this notebook we try a different approach: converting the mp3 files to wav files in parallel using joblib. This notebook takes only 8-9 minutes to process the validation files
(about 3 times faster).



**Reference** :

The idea comes from this notebook: https://www.kaggle.com/code/kingabzpro/asr-mp3-to-wav-dataset?fbclid=IwAR3D61p3PW9lY-IGTOLrjf52b1_SQkc2FFuHl9kEbGumVlgHLqVOGJoQl5w

# Importing Necessary Libraries

In [1]:
import os
import cv2
import skimage.io
from tqdm.notebook import tqdm
import zipfile
import pandas as pd
import numpy as np
import shutil

from pydub import AudioSegment
from joblib import Parallel, delayed

  from .collection import imread_collection_wrapper


In [2]:
# Directory holding the source mp3 clips; save_fn builds each input path from it.
ROOT_PATH = "../data/dlsprint/train_files"
# Directory that receives the converted wav files written by save_fn.
OUTPUT_DIR = "../data/dlsprint/train_files_wav"

In [3]:
# Create the output directory together with any missing parents.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
def save_fn(filename, root_path=None, output_dir=None):
    """Convert a single mp3 file to a wav file resampled to 16 kHz.

    Parameters
    ----------
    filename : str
        Name of the audio file, relative to the input directory.
    root_path : str, optional
        Directory containing the input file. When omitted, the notebook-level
        ``ROOT_PATH`` is read at call time.
    output_dir : str, optional
        Directory that receives the wav file. When omitted, the notebook-level
        ``OUTPUT_DIR`` is read at call time.

    Returns
    -------
    None
        The result is the wav file written to the output directory.

    Notes
    -----
    A missing input file is skipped silently. A file that fails to decode or
    export is reported on stdout (path plus the exception) and skipped, so one
    bad clip does not abort a whole parallel run.
    """
    src_dir = ROOT_PATH if root_path is None else root_path
    save_path = OUTPUT_DIR if output_dir is None else output_dir
    path = f"{src_dir}/{filename}"

    # exist_ok already covers the "directory is there" case, so no separate
    # os.path.exists check is needed (and parallel workers cannot race on it).
    os.makedirs(save_path, exist_ok=True)

    if not os.path.exists(path):
        return

    # splitext strips whatever extension is present instead of assuming that
    # the name ends with exactly four characters such as ".mp3".
    stem = os.path.splitext(filename)[0]

    try:
        sound = AudioSegment.from_mp3(path)
        sound = sound.set_frame_rate(16000)
        sound.export(f"{save_path}/{stem}.wav", format="wav")
    except Exception as exc:
        # Best effort: keep going, but say why the file failed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # stop the run as expected.
        print(path, repr(exc))

In [5]:
#-------------------------------
# imports
#-------------------------------
import os 
import pandas as pd 
from tqdm.auto import tqdm
import warnings
import librosa
import io
import soundfile as sf
# Register tqdm's progress-bar integration with pandas; the filtering cells
# in this notebook call DataFrame.progress_apply.
tqdm.pandas()
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')


In [6]:
#---------------
# data filtering
#---------------
# Clip identifiers (without file extension) that must be excluded.
# filter_votes returns None for any row whose "path" contains one of these
# substrings, and the row is then dropped.
# NOTE(review): presumably these are clips that failed to decode or are
# otherwise corrupt - confirm the origin of this list.
errors=["common_voice_bn_31727562",
        'common_voice_bn_30998934',
        'common_voice_bn_31595526',
        'common_voice_bn_31534853',
        'common_voice_bn_31518061',
        'common_voice_bn_31518373',
        'common_voice_bn_31613621',
        'common_voice_bn_31555333',
        'common_voice_bn_31772113',
        'common_voice_bn_31605391',
        'common_voice_bn_31631175',
        'common_voice_bn_31563901',
        'common_voice_bn_31691690',
        'common_voice_bn_31692010',
        'common_voice_bn_31683653',
        'common_voice_bn_31692182',
        'common_voice_bn_31519976',
        'common_voice_bn_31675793',
        'common_voice_bn_31019914',
        'common_voice_bn_31660287',
        'common_voice_bn_31660384',
        'common_voice_bn_31557261',
        'common_voice_bn_31633101',
        'common_voice_bn_31599243',
        'common_voice_bn_31521515',
        'common_voice_bn_31777802',
        'common_voice_bn_31777848',
        'common_voice_bn_31669646',
        'common_voice_bn_31566083',
        'common_voice_bn_31530331',
        'common_voice_bn_31727697',
        'common_voice_bn_31513270',
        'common_voice_bn_31686295',
        'common_voice_bn_31753693',
        'common_voice_bn_31686334',
        'common_voice_bn_31765546',
        'common_voice_bn_31765548',
        'common_voice_bn_31662742',
        'common_voice_bn_31704856',
        'common_voice_bn_31635344',
        'common_voice_bn_31618327',
        'common_voice_bn_31743074',
        'common_voice_bn_31678862',
        'common_voice_bn_31626674',
        'common_voice_bn_31626677',
        'common_voice_bn_31523889',
        'common_voice_bn_31610804',
        'common_voice_bn_31769538',
        'common_voice_bn_31533273',
        'common_voice_bn_31445621',
        'common_voice_bn_31620650']

def filter_votes(x):
    """Return the row's up-vote count, or None when the clip is blacklisted.

    Parameters
    ----------
    x : mapping
        A row (for example a pandas Series produced by ``apply(axis=1)``)
        exposing at least the keys ``"path"`` and ``"up_votes"``.

    Returns
    -------
    The value of ``x["up_votes"]``, or ``None`` when ``x["path"]`` contains
    any identifier from the notebook-level ``errors`` list.

    Notes
    -----
    Only the blacklist is applied. Vote-based rules (dropping rows where
    ``up_votes - down_votes <= 0``, ``up_votes < down_votes`` or
    ``up_votes == 0``) were tried earlier and are intentionally disabled, so
    ``"down_votes"`` is no longer read.
    """
    p = x["path"]
    # avoid error data
    if any(pe in p for pe in errors):
        return None
    return x["up_votes"]
# ------------------------- train data ----------------------------------------
train_df = pd.read_csv("../data/dlsprint/train.csv")
print("Total Data before filtering:", len(train_df))

# filter_votes yields None for rows that must be discarded; those end up as
# missing values in "up_votes" and are removed by the dropna that follows.
train_df["up_votes"] = train_df.progress_apply(filter_votes, axis=1)
train_df = train_df.dropna(subset=["up_votes"])
print("Total Data after filtering:", len(train_df))

# File names still to be converted; the cell displays how many there are.
audio_files = train_df["path"].tolist()
len(audio_files)


Total Data before filtering: 206950


  0%|          | 0/206950 [00:00<?, ?it/s]

Total Data after filtering: 206899


206899

In [7]:
import time

start = time.time()

# One lazily created save_fn task per file; the tqdm bar advances as joblib
# pulls tasks from this generator.
tasks = (delayed(save_fn)(filename) for filename in tqdm(audio_files))
# n_jobs=-1 requests every available core, using the multiprocessing backend.
Parallel(n_jobs=-1, backend="multiprocessing")(tasks)

end = time.time()
elapsed = end - start
print("total time to process: {x} seconds".format(x=elapsed))

  0%|          | 0/206899 [00:00<?, ?it/s]

total time to process: 2670.112564563751 seconds


In [13]:
# Re-point the notebook-level paths at the validation split; save_fn reads
# these names when it is called.
ROOT_PATH = "../data/dlsprint/validation_files"
OUTPUT_DIR = "../data/dlsprint/validation_files_wav"
# makedirs with exist_ok=True creates missing parents and does not fail when
# the directory already exists, so the cell can be re-run safely.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [14]:

# ------------------------- validation data ----------------------------------------
validation_df = pd.read_csv("../data/dlsprint/validation.csv")
print("Total Data before filtering:", len(validation_df))

# filter_votes yields None for rows that must be discarded; those end up as
# missing values in "up_votes" and are removed by the dropna that follows.
validation_df["up_votes"] = validation_df.progress_apply(filter_votes, axis=1)
validation_df = validation_df.dropna(subset=["up_votes"])
print("Total Data after filtering:", len(validation_df))

# File names still to be converted; the cell displays how many there are.
audio_files = validation_df["path"].tolist()
len(audio_files)


Total Data before filtering: 7747


  0%|          | 0/7747 [00:00<?, ?it/s]

Total Data after filtering: 7747


7747

In [15]:
import time

start = time.time()

# One lazily created save_fn task per file; the tqdm bar advances as joblib
# pulls tasks from this generator.
tasks = (delayed(save_fn)(filename) for filename in tqdm(audio_files))
# n_jobs=-1 requests every available core, using the multiprocessing backend.
Parallel(n_jobs=-1, backend="multiprocessing")(tasks)

end = time.time()
elapsed = end - start
print("total time to process: {x} seconds".format(x=elapsed))

  0%|          | 0/7747 [00:00<?, ?it/s]

total time to process: 103.99121880531311 seconds
