In [6]:
import os
import numpy as np
import pandas as pd
import librosa
from scipy.io import wavfile

# Define paths
# The dataset root can be overridden with the MTC_ASR_DATASET_DIR environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset the original location is used, so every derived path below is
# exactly the string it was before.
dataset_root = os.environ.get(
    "MTC_ASR_DATASET_DIR", "C:/Users/king/Downloads/MTC-ASR-Dataset-16K"
).rstrip("/\\")

# Directory paths keep their trailing slash; later cells combine them with
# file names via os.path.join.
train_audio_path = f"{dataset_root}/train/"
adapt_audio_path = f"{dataset_root}/adapt/"
train_csv_path = f"{dataset_root}/train.csv"
adapt_csv_path = f"{dataset_root}/adapt.csv"
output_path = f"{dataset_root}/preprocessed_data/"

# Ensure output path exists
os.makedirs(output_path, exist_ok=True)



In [7]:
# Read the metadata table of each split (train first, then adapt)
train_df, adapt_df = (
    pd.read_csv(csv_file) for csv_file in (train_csv_path, adapt_csv_path)
)

# Report the column index of each table
print("Train CSV Columns: ", train_df.columns)
print("Adapt CSV Columns: ", adapt_df.columns)

# Display the first few rows of each CSV file
print(train_df.head(), adapt_df.head(), sep="\n")

Train CSV Columns:  Index(['audio', 'transcript'], dtype='object')
Adapt CSV Columns:  Index(['audio', 'transcript'], dtype='object')
            audio                                         transcript
0  train_sample_0  على إنها عار في الوقت اللي كانت بتتعامل مع أخو...
1  train_sample_1  فأكيد ربنا عوضهم خير هو الراجل بيبقى ليه إختيا...
2  train_sample_2  زي دول كتيره بنشوفها النهارده في العالم وأصبحت...
3  train_sample_3  يعني مين اللي بيحط شروطها يعني أنا شايفه إني م...
4  train_sample_4  والله هي الموضوع مش كليب خلي بالك ولا أغنيه ال...
                  audio                                         transcript
0  adapt_sample_0_clean                            شوفلنا المشوار ده يا حج
1  adapt_sample_1_clean  لأ للأسف دكتوره واحده بس بتعمل العمليه ديت عند...
2  adapt_sample_2_clean                        والراجل تبصله يعني إبن زمنه
3  adapt_sample_3_clean                       و أنت كيف عرفته أبترل يا عمي
4  adapt_sample_4_clean               ميعرفوش حاجه عن السوبر أه غير إنه لب


In [8]:
# Data augmentation functions
def add_noise(data, noise_factor=0.005):
    """Return ``data`` with additive standard-normal noise.

    Args:
        data: Array-like signal of any shape.
        noise_factor: Multiplier applied to the noise before it is added.

    Returns:
        numpy.ndarray with the same shape as ``data``; the input is not
        modified.
    """
    signal = np.asarray(data)
    # Draw one noise value per element. Sizing the noise with len(data) only
    # matches 1-D input; any other shape would fail to broadcast (or broadcast
    # along the wrong axis). For 1-D input this draws the same values as
    # np.random.randn(len(data)).
    noise = np.random.standard_normal(signal.shape)
    return signal + noise_factor * noise

def shift(data, shift_max=2, shift_direction='both'):
    """Circularly shift ``data`` by a random number of samples.

    The offset is drawn with ``np.random.randint(shift_max)``, i.e. from
    ``0`` up to but not including ``shift_max``; with the default of 2 the
    signal therefore moves by at most one sample.

    Args:
        data: Array-like signal passed to ``np.roll``.
        shift_max: Exclusive upper bound of the random offset, in samples.
            Values <= 0 mean "no shift".
        shift_direction: ``'right'`` negates the offset, ``'both'`` negates it
            with probability 1/2, any other value leaves it positive.
            NOTE: ``np.roll`` moves samples toward higher indices for a
            positive offset, so ``'right'`` actually rolls toward lower
            indices; this is kept as-is for compatibility.

    Returns:
        numpy.ndarray: a rolled copy of ``data``.
    """
    if shift_max <= 0:
        # np.random.randint rejects a non-positive upper bound; treat it as a
        # request for no shift while still returning a copy like np.roll does.
        return np.roll(data, 0)

    # Named `offset` so the local does not shadow this function's own name.
    offset = np.random.randint(shift_max)
    if shift_direction == 'right':
        offset = -offset
    elif shift_direction == 'both':
        # Coin flip decides the sign of the offset.
        if np.random.randint(0, 2) == 1:
            offset = -offset
    return np.roll(data, offset)

def stretch(data, rate=1.0):
    """Time-stretch ``data`` by ``rate``.

    Thin wrapper: both arguments are forwarded by keyword to
    ``librosa.effects.time_stretch`` and its result is returned unchanged.
    """
    stretched = librosa.effects.time_stretch(y=data, rate=rate)
    return stretched


In [9]:
import numpy as np
import librosa

# Feature extraction function
def extract_features(file_path, target_sr=16000, n_mfcc=13, augment=True):
    """Load one audio file and summarise it as a single 1-D feature vector.

    The vector is the concatenation of the per-frame means of MFCC, chroma,
    mel-spectrogram, spectral-contrast and tonnetz features. With the default
    ``n_mfcc=13`` and librosa's default band counts this is 166 values.

    Args:
        file_path: Path of the audio file to load.
        target_sr: Sample rate the audio is resampled to on load.
        n_mfcc: Number of MFCC coefficients to compute.
        augment: When True (the default, matching the previous behaviour) the
            signal is randomly perturbed with noise, a circular shift and a
            time-stretch before features are computed, so repeated calls on
            the same file give different results. Pass False for
            deterministic features, e.g. for evaluation or adaptation data.

    Returns:
        ``(features, None)`` on success, or ``(None, message)`` if anything
        failed; no exception propagates to the caller.
    """
    try:
        audio, sr = librosa.load(file_path, sr=target_sr)

        if len(audio) == 0:
            # Nothing to analyse; report it explicitly instead of relying on
            # a downstream librosa error about an invalid frame size.
            return None, "Audio file contains no samples"

        # Data Augmentation (optional)
        if augment:
            audio = add_noise(audio)
            audio = shift(audio)
            audio = stretch(audio, rate=np.random.uniform(0.8, 1.2))

        # Adjust n_fft if audio length is shorter
        n_fft = min(len(audio), 512)  # Use 512 or the length of the audio, whichever is smaller

        # Feature extraction
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft)
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr, n_fft=n_fft)
        mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft)
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr, n_fft=n_fft)
        # Tonnetz is computed on the harmonic component of the signal.
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sr)

        # Combine features: average every feature over time, then concatenate
        features = np.concatenate((
            np.mean(mfccs.T, axis=0),
            np.mean(chroma.T, axis=0),
            np.mean(mel.T, axis=0),
            np.mean(contrast.T, axis=0),
            np.mean(tonnetz.T, axis=0)
        ))

        return features, None  # Return features and no error
    except Exception as e:
        # Deliberately broad: one bad file must not abort a long batch run.
        # Include the exception type because str(e) alone can be empty.
        return None, f"{type(e).__name__}: {e}"


In [10]:
from tqdm import tqdm
def preprocess_and_save_data(csv_path, audio_path, output_file):
    """Extract features for every row of a metadata CSV and save them.

    Each row's ``audio`` value is turned into ``<audio_path>/<audio>.wav`` and
    passed to ``extract_features``; its ``transcript`` value becomes the
    label. Rows that fail are skipped and recorded in ``errors``, so
    ``features`` and ``labels`` always stay aligned.

    Args:
        csv_path: CSV file with ``audio`` and ``transcript`` columns.
        audio_path: Directory containing the ``.wav`` files.
        output_file: Destination passed to ``np.savez_compressed``; the
            archive holds the arrays ``features``, ``labels`` and ``errors``.
    """
    df = pd.read_csv(csv_path)
    print(f"First 5 rows of {csv_path}:")
    print(df.head())

    features_list = []
    labels = []
    errors = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {csv_path}"):
        # Reset per row: the except branch below reports file_path, and it
        # must never be unbound or left over from the previous row when the
        # failure happens while the path itself is being built.
        file_path = None
        try:
            file_path = os.path.join(audio_path, f"{row['audio']}.wav")
            features, error = extract_features(file_path)
            if features is not None:
                features_list.append(features)
                labels.append(row['transcript'])
            else:
                errors.append(f"Error processing {file_path}: {error}")
        except Exception as e:
            failed_item = file_path if file_path is not None else f"row {idx}"
            errors.append(f"Exception processing {failed_item}: {str(e)}")

    features_array = np.array(features_list)
    labels_array = np.array(labels)
    # Force a string dtype so an empty error list is not stored as an empty
    # float64 array.
    errors_array = np.array(errors, dtype=str)

    print(f"Final features array shape: {features_array.shape}")
    print(f"Final labels array shape: {labels_array.shape}")
    print(f"Number of errors: {len(errors)}")

    np.savez_compressed(output_file, features=features_array, labels=labels_array, errors=errors_array)
    print(f"Saved preprocessed data to {output_file}")

# Run the pipeline once per split, train first and adapt second.
# Each entry is (metadata CSV, audio directory, archive file name).
for _csv_file, _audio_dir, _archive_name in (
    (train_csv_path, train_audio_path, 'train_data.npz'),
    (adapt_csv_path, adapt_audio_path, 'adapt_data.npz'),
):
    preprocess_and_save_data(_csv_file, _audio_dir, os.path.join(output_path, _archive_name))

First 5 rows of C:/Users/king/Downloads/MTC-ASR-Dataset-16K/train.csv:
            audio                                         transcript
0  train_sample_0  على إنها عار في الوقت اللي كانت بتتعامل مع أخو...
1  train_sample_1  فأكيد ربنا عوضهم خير هو الراجل بيبقى ليه إختيا...
2  train_sample_2  زي دول كتيره بنشوفها النهارده في العالم وأصبحت...
3  train_sample_3  يعني مين اللي بيحط شروطها يعني أنا شايفه إني م...
4  train_sample_4  والله هي الموضوع مش كليب خلي بالك ولا أغنيه ال...


Processing C:/Users/king/Downloads/MTC-ASR-Dataset-16K/train.csv: 100%|██████████| 50715/50715 [5:34:25<00:00,  2.53it/s]  


Final features array shape: (50715, 166)
Final labels array shape: (50715,)
Number of errors: 0
Saved preprocessed data to C:/Users/king/Downloads/MTC-ASR-Dataset-16K/preprocessed_data/train_data.npz
First 5 rows of C:/Users/king/Downloads/MTC-ASR-Dataset-16K/adapt.csv:
                  audio                                         transcript
0  adapt_sample_0_clean                            شوفلنا المشوار ده يا حج
1  adapt_sample_1_clean  لأ للأسف دكتوره واحده بس بتعمل العمليه ديت عند...
2  adapt_sample_2_clean                        والراجل تبصله يعني إبن زمنه
3  adapt_sample_3_clean                       و أنت كيف عرفته أبترل يا عمي
4  adapt_sample_4_clean               ميعرفوش حاجه عن السوبر أه غير إنه لب


Processing C:/Users/king/Downloads/MTC-ASR-Dataset-16K/adapt.csv: 100%|██████████| 2199/2199 [11:02<00:00,  3.32it/s]

Final features array shape: (2199, 166)
Final labels array shape: (2199,)
Number of errors: 0
Saved preprocessed data to C:/Users/king/Downloads/MTC-ASR-Dataset-16K/preprocessed_data/adapt_data.npz





In [11]:
import numpy as np
# Load the preprocessed data
# The archive paths are built from output_path, exactly as the save step
# builds them, so the load location cannot drift from the save location.
# The archive objects are kept (not closed) because later cells index
# train_data directly.
train_data = np.load(os.path.join(output_path, 'train_data.npz'))
train_features = train_data['features']
train_labels = train_data['labels']

adapt_data = np.load(os.path.join(output_path, 'adapt_data.npz'))
adapt_features = adapt_data['features']
adapt_labels = adapt_data['labels']

In [12]:
# Sanity check: report the shape of each loaded feature matrix
print("Train data shape:", np.shape(train_features))
print("Adaptation data shape:", np.shape(adapt_features))

Train data shape: (50715, 166)
Adaptation data shape: (2199, 166)


In [13]:
# List the arrays stored in the train archive, then the shapes of the
# feature and label arrays (one per line).
print(train_data.files)
print(*(train_data[key].shape for key in ('features', 'labels')), sep="\n")


['features', 'labels', 'errors']
(50715, 166)
(50715,)
