In [5]:
import os
import time
import numpy as np
import pandas as pd
import librosa
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import multiprocessing
import gc

# Constants
SAMPLE_RATE = 16000  # sampling rate passed as `sr` to librosa.load and the feature extractors
N_FFT = 2048  # FFT window size passed as `n_fft` to librosa.stft
CHUNK_SIZE = 5000  # number of (file_path, label) pairs grouped into one worker task
CLEAR_CACHE_INTERVAL = 5  # Clear the cache every 5 chunks

def clear_cache():
    """Clear librosa's cache object, then force a garbage-collection pass.

    NOTE(review): both calls act on the process that invokes this function;
    confirm that is the process actually holding the memory to be released.
    """
    librosa.cache.clear()
    gc.collect()

def load_and_extract_features(file_path, label):
    """Load one audio file and summarise it as a flat dict of mean-pooled features.

    At most the first 30 seconds of the file are loaded, resampled to
    ``SAMPLE_RATE``. One magnitude STFT (window size ``N_FFT``) is computed and
    shared by the chroma and spectral descriptors, instead of letting every
    librosa call recompute its own spectrogram from the waveform.

    Args:
        file_path: Path of the audio file to read.
        label: Class label, stored unchanged under the ``'label'`` key.

    Returns:
        A dict with the keys ``chroma_stft``, ``rms``, ``spectral_centroid``,
        ``spectral_bandwidth``, ``spectral_rolloff``, ``zero_crossing_rate``,
        ``mfcc_1`` .. ``mfcc_20``, ``label`` and ``file_path`` (in that order),
        or ``None`` if loading or extraction raised; the error is printed.
    """
    try:
        audio_data, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=30)

        # Magnitude spectrogram, computed once and reused by every
        # spectrogram-based feature below.
        magnitude = np.abs(librosa.stft(audio_data, n_fft=N_FFT))

        # Insertion order defines the column order of the resulting table.
        features = {
            'chroma_stft': np.mean(librosa.feature.chroma_stft(S=magnitude, sr=SAMPLE_RATE)),
            # RMS and zero-crossing rate are time-domain features, so they
            # still take the waveform.
            'rms': np.mean(librosa.feature.rms(y=audio_data)),
            'spectral_centroid': np.mean(librosa.feature.spectral_centroid(S=magnitude, sr=SAMPLE_RATE)),
            'spectral_bandwidth': np.mean(librosa.feature.spectral_bandwidth(S=magnitude, sr=SAMPLE_RATE)),
            'spectral_rolloff': np.mean(librosa.feature.spectral_rolloff(S=magnitude, sr=SAMPLE_RATE)),
            'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(y=audio_data)),
        }

        # One averaged value per MFCC coefficient, numbered from 1.
        mfccs = librosa.feature.mfcc(y=audio_data, sr=SAMPLE_RATE, n_mfcc=20)
        for index, coefficient in enumerate(mfccs, start=1):
            features[f'mfcc_{index}'] = np.mean(coefficient)

        features['label'] = label
        features['file_path'] = file_path
        return features
    except Exception as e:
        # Deliberate best-effort: a single unreadable file must not abort the
        # whole batch, so report it and let the caller drop the None.
        print(f"Error processing {file_path}: {e}")
        return None

def process_chunk(chunk):
    """Extract features for every ``(file_path, label)`` pair in ``chunk``.

    Returns a list with one entry per pair, in input order; each entry is
    whatever ``load_and_extract_features`` returned for that pair.
    """
    extracted = []
    for audio_path, audio_label in chunk:
        extracted.append(load_and_extract_features(audio_path, audio_label))
    return extracted

def _write_feature_rows(rows, output_path, first_write):
    """Write a batch of feature dicts to the CSV at ``output_path``.

    The first batch of a run overwrites the file and emits the header; later
    batches are appended without a header.
    """
    pd.DataFrame(rows).to_csv(
        output_path,
        mode='w' if first_write else 'a',
        header=first_write,
        index=False,
    )


def main(dataset_path, output_path='audio_features_partial.csv', save_every=50000):
    """Extract features for every ``.wav`` file under ``dataset_path`` into a CSV.

    ``dataset_path`` is expected to contain one sub-folder per class; the
    sub-folder name is used as the label of the ``.wav`` files directly inside
    it. Files are grouped into chunks of ``CHUNK_SIZE`` and the chunks are
    processed by a process pool. Results are written to ``output_path`` in
    batches so that not all rows have to be held in memory at once.

    Args:
        dataset_path: Directory holding one sub-folder per label.
        output_path: CSV file to write. An existing file is overwritten by the
            first batch of this run, so re-running does not append duplicate
            rows to stale output. Nothing is written if no features were
            extracted.
        save_every: Flush pending rows to disk once at least this many have
            accumulated.
    """
    print("Memproses data audio...")
    start_time = time.time()

    # Collect (file_path, label) pairs. Listings are sorted so the row order of
    # the output does not depend on the file system's enumeration order.
    files_to_process = []
    for language_folder in sorted(os.listdir(dataset_path)):
        language_folder_path = os.path.join(dataset_path, language_folder)
        if not os.path.isdir(language_folder_path):
            continue
        files_to_process.extend(
            (os.path.join(language_folder_path, file_name), language_folder)
            for file_name in sorted(os.listdir(language_folder_path))
            if file_name.endswith(".wav")
        )

    chunks = [files_to_process[i:i + CHUNK_SIZE] for i in range(0, len(files_to_process), CHUNK_SIZE)]

    num_workers = multiprocessing.cpu_count() * 2

    pending = []           # rows extracted but not yet written to disk
    total_processed = 0    # rows written to output_path during this run
    has_written = False    # whether this run has already created the file
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = executor.map(process_chunk, chunks)
        for i, result in enumerate(tqdm(results, total=len(chunks), desc="Processing chunks")):
            # Failed files come back as None and are dropped here.
            pending.extend(item for item in result if item)

            if (i + 1) % CLEAR_CACHE_INTERVAL == 0:
                # NOTE(review): this runs in the parent process, while the
                # feature extraction runs inside the pool's worker processes;
                # confirm it releases the memory it is meant to release.
                clear_cache()
                print(f"Cache cleared after chunk {i+1}")

            # Save partial results once enough rows have accumulated
            if pending and len(pending) >= save_every:
                _write_feature_rows(pending, output_path, first_write=not has_written)
                has_written = True
                total_processed += len(pending)
                pending = []
                print(f"Partial results saved after chunk {i+1}")

    # Save the remaining rows
    if pending:
        _write_feature_rows(pending, output_path, first_write=not has_written)
        has_written = True
        total_processed += len(pending)

    end_time = time.time()
    print(f"Selesai! Fitur telah disimpan ke '{output_path}'")
    print(f"Total waktu pemrosesan: {end_time - start_time:.2f} detik")

    # Report how many rows were written, counted as they were flushed rather
    # than by re-reading the CSV (which leaked a file handle and failed when no
    # file had been produced).
    print(f"Jumlah total file yang diproses: {total_processed}")

# Run the extraction when this cell/script is executed directly.
if __name__ == "__main__":
    # Dataset root; main() uses each sub-folder name as the label of the
    # .wav files inside it.
    dataset_path = '/kaggle/input/preprocess-common-language/processed audio dataset'
    main(dataset_path)

Memproses data audio...


  return pitch_tuning(
Processing chunks:  24%|██▍       | 5/21 [47:55<1:48:37, 407.31s/it]  

Cache cleared after chunk 5


  return pitch_tuning(
  return pitch_tuning(
Processing chunks:  43%|████▎     | 9/21 [1:20:34<1:31:37, 458.09s/it]

Cache cleared after chunk 10


Processing chunks:  48%|████▊     | 10/21 [1:21:09<1:10:01, 381.93s/it]

Partial results saved after chunk 10


Processing chunks:  71%|███████▏  | 15/21 [1:27:58<17:09, 171.56s/it]

Cache cleared after chunk 15


Processing chunks:  90%|█████████ | 19/21 [1:48:23<07:47, 233.70s/it]

Cache cleared after chunk 20


Processing chunks: 100%|██████████| 21/21 [1:48:27<00:00, 309.86s/it]

Partial results saved after chunk 20





Selesai! Fitur telah disimpan ke 'audio_features_partial.csv'
Total waktu pemrosesan: 6508.62 detik
Jumlah total file yang diproses: 102126


In [1]:
import pandas as pd

# Load the extracted feature table (presumably the CSV written by the
# feature-extraction cell -- confirm the working directory matches).
df = pd.read_csv(r'/kaggle/working/audio_features_partial.csv')

# Summarise the columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102126 entries, 0 to 102125
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   chroma_stft         102126 non-null  float64
 1   rms                 102126 non-null  float64
 2   spectral_centroid   102126 non-null  float64
 3   spectral_bandwidth  102126 non-null  float64
 4   spectral_rolloff    102126 non-null  float64
 5   zero_crossing_rate  102126 non-null  float64
 6   mfcc_1              102126 non-null  float64
 7   mfcc_2              102126 non-null  float64
 8   mfcc_3              102126 non-null  float64
 9   mfcc_4              102126 non-null  float64
 10  mfcc_5              102126 non-null  float64
 11  mfcc_6              102126 non-null  float64
 12  mfcc_7              102126 non-null  float64
 13  mfcc_8              102126 non-null  float64
 14  mfcc_9              102126 non-null  float64
 15  mfcc_10             102126 non-nul

In [2]:
# Class balance: number of rows per value of the 'label' column.
df['label'].value_counts()

label
Breton               3570
Swedish              3177
Hakha Chin           3012
Chinese Taiwan       3003
Latvian              2955
Tatar                2784
Kabyle               2766
Arabic               2751
Indonesian           2733
Slovenian            2691
Dutch                2550
Romanian             2547
Greek                2520
Interlingua          2484
Turkish              2472
Czech                2442
Tamil                2367
Kyrgyz               2331
Persian              2283
Maltese              2277
Japanese             2268
Portuguese           2238
Chuvash              2202
Polish               2196
Ukrainian            2181
Welsh                2073
Frisian              2073
Chinese Hongkong     2016
Romansh Sursilvan    1977
Dhivehi              1962
Kinyarwanda          1947
Esperanto            1932
Basque               1914
Georgian             1890
Mongolian            1884
French               1863
Catalan              1830
German               1830
Chines

In [21]:
# Preview the first five rows.
# NOTE(review): the displayed output has 'Unnamed: 0' and 'numeric_labels'
# columns that the extraction step does not write, so `df` was presumably
# reloaded or modified by cells not shown here -- confirm before re-running.
df.head(5)

Unnamed: 0,chroma_stft,rms,spectral_centroid,spectral_bandwidth,spectral_rolloff,zero_crossing_rate,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,label,file_path,numeric_labels
0,0.712224,0.05574,2989.050985,2193.800068,5666.294643,0.275144,-202.32579,27.614292,4.094784,5.301181,...,-0.985637,-6.752584,-4.679379,-5.478848,-0.866508,-1.919669,-0.634521,Arabic,/kaggle/input/preprocess-common-language/proce...,0
1,0.598403,0.074004,2372.315827,2065.56102,4795.64951,0.178041,-137.41476,59.931843,5.077963,-5.712012,...,-2.134825,-10.989368,-1.460541,-4.485021,-0.408789,-8.211143,-5.170048,Arabic,/kaggle/input/preprocess-common-language/proce...,0
2,0.756316,0.046051,3274.178654,2196.474265,5870.572917,0.353054,-233.02031,22.69155,10.057923,3.829097,...,-5.409642,-4.017134,-6.744406,-1.69763,-0.387302,0.829549,1.29211,Arabic,/kaggle/input/preprocess-common-language/proce...,0
3,0.588983,0.061376,1948.418292,2049.242741,4186.921296,0.137682,-199.0449,80.80687,31.45138,-1.297673,...,0.065943,-9.312079,-7.16406,-3.08204,-8.046175,-3.083879,-2.018449,Arabic,/kaggle/input/preprocess-common-language/proce...,0
4,0.647222,0.069638,1705.618989,1824.714129,3730.709877,0.09554,-325.47556,92.37382,17.725632,31.867613,...,4.538502,-1.073114,-1.204524,-0.108214,-4.80346,-2.882802,-1.455632,Arabic,/kaggle/input/preprocess-common-language/proce...,0
