In [1]:
from python_speech_features import mfcc
import pandas as pd
import numpy as np
import pydub
import librosa
import pickle
import re
import os
import warnings
warnings.filterwarnings('ignore')

## Helper Functions

In [2]:
def Zone(name):
    """
    Build [file name, day, time] entries for the mp3 files of one zone folder.

    Parameters
    ----------
    name : iterable of str
        File names of a zone folder (e.g. the result of os.listdir).

    Returns
    -------
    list of [str, int, int]
        One entry per name containing ".mp3": the file name, the integer
        parsed from characters 6-7 (day) and the integer parsed from
        characters 8-11 (time).

    Raises
    ------
    ValueError
        If either character range of an mp3 name is not an integer.
    """
    # int() already ignores leading zeros. Stripping them first turned a
    # "0000" time stamp into an empty string and raised ValueError.
    return [
        [file, int(file[6:8]), int(file[8:12])]
        for file in name
        if ".mp3" in file
    ]

In [3]:
def file_name(x, zone):
    """
    Look up the mp3 file of a zone that matches the day and time of row ``x``.

    ``zone`` is a sequence of [file name, day, time] entries. The file name of
    the first entry whose day equals ``x.day`` and whose time equals ``x.time``
    is returned; None is returned when no entry matches.
    """
    candidates = (
        entry[0]
        for entry in zone
        if (entry[1] == x.day) & (entry[2] == x.time)
    )
    return next(candidates, None)

In [4]:
def mp3_path(x):
    """
    Return the mp3 file name for row ``x``, searched in the table of its zone.

    ``x.zone`` selects one of the module-level tables Zone1, Zone4 or Zone8;
    any other value falls back to Zone13. The lookup itself is done by
    file_name, so None is returned when no file matches the row.
    """
    zone_label = x.zone
    if zone_label == 'Zone1':
        table = Zone1
    elif zone_label == 'Zone4':
        table = Zone4
    elif zone_label == 'Zone8':
        table = Zone8
    else:
        table = Zone13
    return file_name(x, table)

In [5]:
def export(df, name):
    """
    Pickle a {row index: wav_rate value} mapping of ``df`` to the file ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'wav_rate' column. It is not modified.
    name : str
        Path of the pickle file to write (overwritten if it exists).

    Notes
    -----
    If the index holds duplicate labels, the last row for a label wins.
    """
    # Build the mapping straight from the index and the column. The previous
    # version added a helper 'index_' column to the caller's frame (usually a
    # slice of another frame) and walked it row by row with iterrows.
    output = dict(zip(df.index, df['wav_rate']))

    with open(name, "wb") as fil:
        pickle.dump(output, fil, pickle.HIGHEST_PROTOCOL)

In [6]:
def load(name):
    """
    Read a pickled mapping from the file ``name`` and return it as a new dict.

    Parameters
    ----------
    name : str
        Path of a pickle file, e.g. one written by export.

    Returns
    -------
    dict
        A fresh dict holding every key/value pair of the unpickled mapping,
        in its iteration order.
    """
    # SECURITY: pickle.load can execute arbitrary code, so only open files
    # that this notebook (or another trusted source) produced.
    # The with-block closes the file even when unpickling fails.
    with open(name, 'rb') as file:
        input_ = pickle.load(file)

    return {key: input_[key] for key in input_}

In [7]:
def pydub_to_np(audio):
    """
    Convert an audio segment into a scaled float64 array plus its frame rate.

    The interleaved samples are reshaped to one column per channel and
    transposed, so the array has shape (channels, samples per channel).
    Values are divided by 2 ** (8 * sample_width).

    Returns a tuple (array, audio.frame_rate).
    """
    interleaved = np.array(audio.get_array_of_samples(), dtype=np.float64)
    per_channel = interleaved.reshape((-1, audio.channels)).T
    scale = 1 << (8 * audio.sample_width)
    return per_channel / scale, audio.frame_rate

In [8]:
def get_mfcc(wav_rate):
    """
    Return the MFCC features of one audio clip.

    Parameters
    ----------
    wav_rate : tuple
        (samples, sample rate) pair, e.g. as returned by pydub_to_np.

    Returns
    -------
    The result of librosa.feature.mfcc computed with n_mfcc=40.
    """
    wav, sample_rate = wav_rate
    # Pass the audio as the keyword argument `y`: recent librosa releases
    # no longer accept it positionally, while older ones accept the keyword too.
    features = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=40)
    return features

## Categories:
#### Highest quality - no X and no []
#### High quality - no X but has []
#### Medium quality - some Xs but no [ ]
#### Low quality - some X and has [ ]
#### Lowest quality - contains only Xs (in the code: has X, no [ ], and a transcript of at most 5 characters)

In [11]:
# Load the transcript table that every later cell filters and labels.
df = pd.read_csv("transcripts_noid2022_02_06.csv")

In [12]:
# number of transcript rows
df.shape[0]

61847

In [13]:
# distinct zone labels present in the transcripts
df['zone'].unique()

array(['Zone1', 'Zone4', 'Zone8', 'Zone13'], dtype=object)

In [14]:
df['char_len'] = df.transcription.apply(lambda x: len(str(x))) #count the number of characters (not words) in each transcript; a missing value is measured as its str() form

In [15]:
# Highest quality: the transcript has no <X> marker and no [word] annotation.
# na=True counts a missing transcript as "contains", which keeps such rows out
# of this group exactly as the former `== False` comparison did.
# .copy() makes `highest` an independent frame, so adding the label column
# does not write into a slice of df (SettingWithCopyWarning).
highest = df[
    ~df.transcription.str.contains("<X>", na=True)
    & ~df.transcription.str.contains(r'\[\w+\]', na=True)
].copy()
highest['label'] = 'highest'

In [16]:
# High quality: no <X> marker, but at least one [word] annotation.
# The na= arguments keep rows with a missing transcript out of the group,
# as the former `== False` / `== True` comparisons did.
# .copy() detaches the subset from df before the label column is added.
high = df[
    ~df.transcription.str.contains("<X>", na=True)
    & df.transcription.str.contains(r'\[\w+\]', na=False)
].copy()
high['label'] = 'high'

In [17]:
# Rows with an <X> marker but no [word] annotation; rows with a missing
# transcript are excluded, as before. `sub` is also used by the next cell.
sub = df[
    df.transcription.str.contains("<X>", na=False)
    & ~df.transcription.str.contains(r'\[\w+\]', na=True)
]
# Medium quality: the transcript is longer than 5 characters.
# .copy() detaches the subset before the label column is added.
medium = sub[sub['char_len'] > 5].copy()
medium['label'] = 'medium'

In [18]:
# Lowest quality: rows of `sub` whose transcript is at most 5 characters long.
# .copy() detaches the subset from `sub` before the label column is added.
lowest = sub[sub['char_len'] <= 5].copy()
lowest['label'] = 'lowest'

In [19]:
# Low quality: the transcript has both an <X> marker and a [word] annotation.
# na=False keeps rows with a missing transcript out of the group, as the
# former `== True` comparisons did; .copy() detaches the subset from df.
low = df[
    df.transcription.str.contains("<X>", na=False)
    & df.transcription.str.contains(r'\[\w+\]', na=False)
].copy()
low['label'] = 'low'

In [23]:
# list the contents of every zone folder
Zone1_arr, Zone4_arr, Zone8_arr, Zone13_arr = (
    os.listdir(folder) for folder in ("Zone1/", "Zone4/", "Zone8/", "Zone13/")
)

In [24]:
# turn each folder listing into a table of [file name, day, time] entries
Zone1, Zone4, Zone8, Zone13 = map(
    Zone, (Zone1_arr, Zone4_arr, Zone8_arr, Zone13_arr)
)

## Create target variables for model training
class 1 (target = 0) -> highest quality

class 2 (target = 1) -> medium + low + lowest (the `high` category is not used for training)

## Class 1

In [25]:
# Attach the matching recording file name to every 'highest' row.
# Rows without a matching file get None from mp3_path.
highest['mp3'] = None
highest['mp3'] = highest.apply(mp3_path, axis=1)

In [282]:
# Cut every Zone1 'highest' row out of its recording and cache the result.
# .copy() detaches the subset from `highest`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone1 = highest[highest.zone == 'Zone1'].copy()
# start/end are multiplied by 1000 to index the segment (presumably seconds
# converted to the milliseconds pydub slices use; confirm against the CSV).
zone1['wav_rate'] = zone1.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone1/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone1, "class1_wav_1.pkl")

In [281]:
# Cut every Zone4 'highest' row out of its recording and cache the result.
# .copy() detaches the subset from `highest`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone4 = highest[highest.zone == 'Zone4'].copy()
zone4['wav_rate'] = zone4.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone4/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone4, "class1_wav_4.pkl")

In [280]:
# Cut every Zone8 'highest' row out of its recording and cache the result.
# .copy() detaches the subset from `highest`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone8 = highest[highest.zone == 'Zone8'].copy()
zone8['wav_rate'] = zone8.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone8/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone8, "class1_wav_8.pkl")

In [283]:
# Cut every Zone13 'highest' row out of its recording and cache the result.
# .copy() detaches the subset from `highest`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone13 = highest[highest.zone == 'Zone13'].copy()
zone13['wav_rate'] = zone13.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone13/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
# Export zone13 itself: this cell used to write zone8 into the Zone13 cache,
# so Zone8 clips were counted twice and Zone13 clips never reached class 1.
export(zone13, "class1_wav_13.pkl")

In [22]:
# Reload the cached class-1 clips of each zone as (id, wav_rate) frames.
zone1 = pd.DataFrame.from_dict(load("class1_wav_1.pkl").items())
zone1.columns = ['id', 'wav_rate']

zone4 = pd.DataFrame.from_dict(load("class1_wav_4.pkl").items())
zone4.columns = ['id', 'wav_rate']

zone8 = pd.DataFrame.from_dict(load("class1_wav_8.pkl").items())
zone8.columns = ['id', 'wav_rate']

zone13 = pd.DataFrame.from_dict(load("class1_wav_13.pkl").items())
zone13.columns = ['id', 'wav_rate']

In [6]:
# Stack the four zone frames under a fresh 0..n-1 index, then compute the
# MFCC features of every clip.
zone_frames = [zone1, zone4, zone8, zone13]
zone = pd.concat(zone_frames, axis=0).reset_index(drop=True)
zone['MFCC'] = zone['wav_rate'].apply(get_mfcc)

In [8]:
# One {coefficient number: values} dict per clip, keyed by the clip's position.
# [0] takes the first entry along the leading axis of the MFCC array
# (presumably the channel axis; confirm against get_mfcc's output shape).
# NOTE: the name `mfcc` shadows the function imported from
# python_speech_features in the first cell.
mfcc = {
    position: dict(enumerate(zone['MFCC'].values[position][0]))
    for position in range(len(zone))
}

#split the 40 mfcc features into 40 columns in dataframe
class1 = pd.DataFrame.from_dict(mfcc.values())

In [10]:
## Summarise every MFCC column cell by its mean, median and standard
## deviation (40 columns x 3 statistics = 120 feature columns).
## NOTE: class1 is overwritten here, so re-running this cell alone would
## summarise the summary; re-run the previous cell first.
class1_mean, class1_median, class1_std = (
    class1.applymap(statistic).add_suffix(suffix)
    for statistic, suffix in (
        (np.mean, "_mean"),
        (np.median, "_median"),
        (np.std, "_std"),
    )
)
class1 = pd.concat([class1_mean, class1_median, class1_std], axis=1)
class1['target'] = 0  # 1 indicates low quality, 0 indicates high quality

## Class 2

In [268]:
# Class 2 pools the medium, low and lowest quality rows.
class2_parts = [medium, low, lowest]
subdf = pd.concat(class2_parts)
# Attach the matching recording file name (None when mp3_path finds no file).
subdf['mp3'] = None
subdf['mp3'] = subdf.apply(mp3_path, axis=1)

In [267]:
#get sound array, sample rate of files in Zone 1
# .copy() detaches the subset from `subdf`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone1 = subdf[subdf.zone == 'Zone1'].copy()
# start/end are multiplied by 1000 to index the segment (presumably seconds
# converted to the milliseconds pydub slices use; confirm against the CSV).
zone1['wav_rate'] = zone1.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone1/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone1, "class2_wav_1.pkl")

In [260]:
# Cut every Zone4 class-2 row out of its recording and cache the result.
# .copy() detaches the subset from `subdf`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone4 = subdf[subdf.zone == 'Zone4'].copy()
zone4['wav_rate'] = zone4.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone4/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone4, "class2_wav_4.pkl")

In [262]:
# Cut every Zone8 class-2 row out of its recording and cache the result.
# .copy() detaches the subset from `subdf`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone8 = subdf[subdf.zone == 'Zone8'].copy()
zone8['wav_rate'] = zone8.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone8/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone8, "class2_wav_8.pkl")

In [264]:
# Cut every Zone13 class-2 row out of its recording and cache the result.
# .copy() detaches the subset from `subdf`, so the new column is written to
# an independent frame instead of a slice (SettingWithCopyWarning).
zone13 = subdf[subdf.zone == 'Zone13'].copy()
zone13['wav_rate'] = zone13.apply(
    lambda x: pydub_to_np(
        pydub.AudioSegment.from_mp3('Zone13/' + x.mp3)[(x.start * 1000):(x.end * 1000)]
    ),
    axis=1,
)
export(zone13, "class2_wav_13.pkl")

In [17]:
# Reload the cached class-2 clips of each zone as (id, wav_rate) frames.
zone1 = pd.DataFrame.from_dict(load("class2_wav_1.pkl").items())
zone1.columns = ['id', 'wav_rate']

zone4 = pd.DataFrame.from_dict(load("class2_wav_4.pkl").items())
zone4.columns = ['id', 'wav_rate']

zone8 = pd.DataFrame.from_dict(load("class2_wav_8.pkl").items())
zone8.columns = ['id', 'wav_rate']

zone13 = pd.DataFrame.from_dict(load("class2_wav_13.pkl").items())
zone13.columns = ['id', 'wav_rate']

# Stack the four zone frames under a fresh 0..n-1 index, then compute the
# MFCC features of every clip.
zone_frames = [zone1, zone4, zone8, zone13]
zone = pd.concat(zone_frames, axis=0).reset_index(drop=True)
zone['MFCC'] = zone['wav_rate'].apply(get_mfcc)

In [18]:
# One {coefficient number: values} dict per clip, keyed by the clip's position.
# [0] takes the first entry along the leading axis of the MFCC array
# (presumably the channel axis; confirm against get_mfcc's output shape).
# NOTE: the name `mfcc` shadows the function imported from
# python_speech_features in the first cell.
mfcc = {
    position: dict(enumerate(zone['MFCC'].values[position][0]))
    for position in range(len(zone))
}

#split the 40 mfcc features into 40 columns in dataframe
class2 = pd.DataFrame.from_dict(mfcc.values())

In [20]:
## Summarise every MFCC column cell by its mean, median and standard
## deviation (40 columns x 3 statistics = 120 feature columns).
## NOTE: class2 is overwritten here, so re-running this cell alone would
## summarise the summary; re-run the previous cell first.
class2_mean, class2_median, class2_std = (
    class2.applymap(statistic).add_suffix(suffix)
    for statistic, suffix in (
        (np.mean, "_mean"),
        (np.median, "_median"),
        (np.std, "_std"),
    )
)
class2 = pd.concat([class2_mean, class2_median, class2_std], axis=1)
class2['target'] = 1  # 1 indicates low quality, 0 indicates high quality