In [1]:
import numpy as np
import pandas as pd
import librosa

# Load & Describe Dataset

In [2]:
voicePath = '../Dataset/voices'
transcriptsPath = '../Dataset/transcripts.csv'

In [3]:
transcripts = pd.read_csv(transcriptsPath)

In [4]:
transcripts.head()

Unnamed: 0,voice_filename,transcript,accent,gender,tone
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question
1,voice_2.mp3,همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal
2,voice_3.mp3,دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal
3,voice_4.mp3,شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal
4,voice_5.mp3,باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولید...,فارسی,male,normal


In [5]:
transcripts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6042 entries, 0 to 6041
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   voice_filename  6042 non-null   object
 1   transcript      6042 non-null   object
 2   accent          6042 non-null   object
 3   gender          6042 non-null   object
 4   tone            6042 non-null   object
dtypes: object(5)
memory usage: 236.1+ KB


In [6]:
transcripts.describe()

Unnamed: 0,voice_filename,transcript,accent,gender,tone
count,6042,6042,6042,6042,6042
unique,6042,6034,7,5,19
top,voice_5386.mp3,سوم – سازمان های دولتی باید از پیشنهاد یا تاًی...,فارسی,male,normal
freq,1,2,5581,3899,4464


In [7]:
transcripts['gender'].value_counts()

male      3899
female    1282
male       702
مرد        119
Female      40
Name: gender, dtype: int64

In [8]:
transcripts['tone'].value_counts()

normal                 4464
question                529
imperative              299
exclamatory             289
incomplete              263
normal                   68
Normal                   40
nomal                    21
exclamative              16
incomplete               11
quenstion                 9
impreative                9
question                  6
incomplet                 5
Question                  4
exclamatory               4
imperative                3
question/incomplete       1
nortmal                   1
Name: tone, dtype: int64

In [9]:
transcripts['accent'].value_counts()

فارسی      5581
فارسی       247
ترکی         83
farsi        64
خراسانی      40
شیرازی       19
یزدی          8
Name: accent, dtype: int64

## Preprocessing on columns value

In [10]:
columns_to_clean = ['accent', 'tone', 'gender']

replacement_mapping = {'nomal': 'normal', 'nortmal': 'normal', 
                       'quenstion': 'question',
                        'impreative': 'imperative', 
                        'incomplet': 'incomplete', 
                        'exclamative': 'exclamatory',
                        'مرد': 'male',
                        'farsi': 'فارسی',
                        'question/incomplete': 'question'}

# Apply the transformations
for col in columns_to_clean:
    transcripts[col] = transcripts[col].str.strip().str.lower()
    transcripts[col] = transcripts[col].replace(replacement_mapping)

In [11]:
transcripts['gender'].value_counts()

male      4720
female    1322
Name: gender, dtype: int64

In [12]:
transcripts['tone'].value_counts()

normal         4594
question        549
imperative      311
exclamatory     309
incomplete      279
Name: tone, dtype: int64

In [13]:
transcripts['accent'].value_counts()

فارسی      5892
ترکی         83
خراسانی      40
شیرازی       19
یزدی          8
Name: accent, dtype: int64

## Preprocessing on Audio

In [14]:
def audioPreprocessing(input_path, target_sr):

    y, sr = librosa.load(input_path)

    # Resample to a specific sample rate
    y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    # Trim Silence
    y_trimed, index = librosa.effects.trim(y_resampled)

    # Normalize Audio Levels
    y_normalized = librosa.util.normalize(y_trimed)

    # Remove Background Noise
    y_denoised = librosa.effects.preemphasis(y_normalized)

    return y_denoised

In [15]:
def audioFeatureExtraction(input_path, target_sr):
    # Preprocess audio
    y_clean = audioPreprocessing(input_path, target_sr)

    # Extract features
    mfccs = librosa.feature.mfcc(y=y_clean, sr=target_sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y_clean, sr=target_sr)
    contrast = librosa.feature.spectral_contrast(y=y_clean, sr=target_sr)
    centroid = librosa.feature.spectral_centroid(y=y_clean, sr=target_sr)
    bandwidth = librosa.feature.spectral_bandwidth(y=y_clean, sr=target_sr)
    zero_crossings = librosa.feature.zero_crossing_rate(y=y_clean)
    
    # Additional features...
    
    features_dict = {
        'mfcc_mean': mfccs.mean(axis=1),
        'chroma_mean': chroma.mean(axis=1),
        'contrast_mean': contrast.mean(axis=1),
        'centroid_mean': centroid.mean(),
        'bandwidth_mean': bandwidth.mean(),
        'zero_crossings_mean': zero_crossings.mean(),
        # Add more features...
    }

    return features_dict

In [16]:
target_sr = 16000
features_df  = transcripts['voice_filename'].apply(lambda x: audioFeatureExtraction(voicePath + '/' + x, target_sr))
features_dataframe = pd.DataFrame(features_df.tolist())

transcripts_features = pd.concat([transcripts, features_dataframe], axis=1)

In [17]:
transcripts_features.describe()

Unnamed: 0,centroid_mean,bandwidth_mean,zero_crossings_mean
count,6042.0,6042.0,6042.0
mean,2946.353858,1830.851651,0.292715
std,514.138135,238.57483,0.06407
min,1211.38399,964.516766,0.111397
25%,2614.443391,1698.685016,0.249784
50%,2948.09708,1883.215322,0.283855
75%,3297.615054,2000.500074,0.329858
max,4487.562284,2461.147785,0.554474


In [20]:
transcripts_features.head()

Unnamed: 0,voice_filename,transcript,accent,gender,tone,mfcc_mean,chroma_mean,contrast_mean,centroid_mean,bandwidth_mean,zero_crossings_mean
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question,"[-226.45906, 27.234396, -4.854726, 13.827248, ...","[0.631335, 0.5893609, 0.59790057, 0.6132906, 0...","[27.924137314442838, 12.368583018695546, 15.13...",3002.330924,2087.859977,0.304001
1,voice_2.mp3,همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal,"[-341.25583, -10.133878, 5.558397, 17.953564, ...","[0.38676995, 0.33763474, 0.3399514, 0.38050535...","[27.67936515878549, 14.142937557692985, 17.848...",3716.514567,2131.470004,0.415193
2,voice_3.mp3,دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal,"[-308.26108, -7.790743, 6.654821, 9.634984, 7....","[0.40178165, 0.3928808, 0.37732053, 0.48148748...","[27.216098722816948, 13.855844943641957, 18.61...",3715.257064,2089.085343,0.427597
3,voice_4.mp3,شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal,"[-324.18372, -12.081556, 0.6377594, 17.15852, ...","[0.4548103, 0.33421117, 0.3522711, 0.37673205,...","[26.48621315711807, 14.083842997304714, 18.324...",3742.295518,1994.383106,0.423299
4,voice_5.mp3,باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولید...,فارسی,male,normal,"[-273.72803, 4.334649, -1.7544707, 18.322426, ...","[0.4542341, 0.41574174, 0.44602737, 0.4391814,...","[27.709682279899432, 13.965553509440323, 18.92...",3451.477394,2081.125187,0.387289


In [None]:
transcripts_features.to_csv('../Dataset/transcripts_features.csv')

In [18]:
# for voiceName in transcripts['voice_filename']:
#     try:
#         y, sr = librosa.load(voicePath+'/'+voiceName, sr=None)
#     except:
#         print(voiceName)