In [None]:
import pandas as pd
import numpy as np
import csv
import os
from PIL import Image
import IPython.display as ipd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import librosa
import librosa.display
from scipy import signal
from scipy.io import wavfile
from tqdm import tqdm
import pickle
from pydub import AudioSegment


In [None]:

"""  For listing all the audio files in a directory (ignoring hidden files)"""
def get_filelist(path):
    """Return the visible entries of `path` whose names end in 'v'.

    Names starting with '.' (hidden files) are dropped. The suffix test is a
    single character, so '.wav' files match, but so does any other name that
    happens to end in 'v'. Entries come back in `os.listdir` order.
    """
    return [
        entry
        for entry in os.listdir(path)
        if not entry.startswith(".") and entry.endswith("v")
    ]

## Segmentation

In [None]:
"""  For clipping audio into 1 sec segments """

def segment(path, segment_time=1):
    """Split every audio file under ``path + 'wav/'`` into fixed-length clips.

    For each name returned by ``get_filelist(path + 'wav/')`` the directory
    ``path + 'segments/' + name[:-4]`` is created and ffmpeg's segment muxer
    writes ``000.wav``, ``001.wav``, ... into it with ``-c copy``.

    Args:
        path: Dataset root. It is used as a plain string prefix, so it must
            end with '/'.
        segment_time: Clip length in seconds handed to ``-segment_time``
            (default 1, the value that used to be hard-coded).

    Raises:
        FileNotFoundError: if the ``ffmpeg`` executable is not on PATH.
    """
    # Local import: subprocess is not part of the notebook's import cell.
    import subprocess

    files = get_filelist(path + 'wav/')
    # exist_ok keeps the cell re-runnable; a bare `mkdir` errors once the
    # directory already exists.
    os.makedirs(path + 'segments', exist_ok=True)
    failed = []
    for file in tqdm(files):
        out_dir = path + 'segments/' + file[:-4]
        os.makedirs(out_dir, exist_ok=True)
        # An argument list (no shell) lets names containing spaces or shell
        # metacharacters reach ffmpeg unchanged.
        result = subprocess.run([
            'ffmpeg', '-i', path + 'wav/' + file,
            '-f', 'segment', '-segment_time', str(segment_time),
            '-c', 'copy', out_dir + '/%03d.wav',
        ])
        if result.returncode != 0:
            failed.append(file)
    # Report failures once, after the progress bar has finished.
    if failed:
        print('ffmpeg failed for %d file(s): %s' % (len(failed), ', '.join(failed)))
# Segment the audio under every dataset root listed in `paths`.
# NOTE(review): `paths` is not defined in any cell visible here — confirm it is
# set before this cell runs.
for path in paths:
    segment(path)

100%|██████████| 260/260 [00:26<00:00,  9.91it/s]
100%|██████████| 260/260 [00:27<00:00,  9.47it/s]



## Padding

In [None]:
"""  For padding audio segments to make all of them the same size """
def padding(path, pad_ms=998):
    """Pad short segment clips with trailing silence, overwriting them in place.

    For each name returned by ``get_filelist(path + 'wav/')`` the clips in
    ``path + 'segments/' + name[:-4]`` are loaded; any clip shorter than
    `pad_ms` milliseconds gets silence appended and is re-exported as wav to
    the same file. Clips of `pad_ms` or longer are left untouched.

    Args:
        path: Dataset root. It is used as a plain string prefix, so it must
            end with '/'.
        pad_ms: Minimum clip length in milliseconds (default 998, the value
            that used to be hard-coded).
    """
    files = get_filelist(path + 'wav/')
    for file in tqdm(files):
        segment_dir = path + 'segments/' + file[:-4]
        # get_filelist rather than os.listdir, so hidden or non-audio entries
        # in the segment folder are never passed to AudioSegment.from_wav.
        for pp in get_filelist(segment_dir):
            filename = segment_dir + '/' + pp
            audio = AudioSegment.from_wav(filename)
            shortfall = pad_ms - len(audio)  # len() of the clip is compared against milliseconds
            if shortfall > 0:
                # Silence goes after the audio; the file is overwritten.
                padded = audio + AudioSegment.silent(duration=shortfall)
                padded.export(filename, format='wav')
# Pad the segments under every dataset root listed in `paths`.
# NOTE(review): `paths` is not defined in any cell visible here — confirm it is
# set before this cell runs.
for path in paths:
    padding(path)

100%|██████████| 260/260 [00:00<00:00, 672.66it/s]
100%|██████████| 260/260 [00:01<00:00, 257.14it/s]


## Audio Feature Extraction

In [None]:
def audio_feature_extractor(path,audio):
    """Add one averaged feature vector per audio file under `path` to `audio`.

    For every name from ``get_filelist(path+'wav/')`` the clips in
    ``path+'segments/'+name[:-4]`` are processed one by one. Each clip
    contributes ``hstack([mel.mean(1), mfcc.mean(1), prosodic.flatten()])``;
    these are summed into a length-291 vector and divided by the number of
    clips. The result is stored under ``audio[name[:-4]]``.

    Args:
        path: Dataset root, used as a string prefix (callers pass a value
            ending in '/').
        audio: Dict that is updated in place.

    Returns:
        The same `audio` dict that was passed in.

    NOTE(review): 291 presumably equals 128 mel bands + 128 MFCCs + 35
    prosodic values — only ``n_mfcc=128`` is explicit here; confirm the rest.
    """
    files = get_filelist(path+'wav/')
    
    for file in tqdm(files):
        minus=0  # NOTE(review): never read anywhere in this function
        vector=np.zeros(291)  # running sum of the per-clip feature vectors
        segments= get_filelist(path+'segments/'+file[:-4])
        for pp in segments:
            filename=path+'segments/'+file[:-4]+'/'+pp
            sr=22050
            clip, sr = librosa.load(filename, sr=sr)
            # removing background noise
            S_full, phase = librosa.magphase(librosa.stft(clip))
            # Appends one all-zero column; the 1025 rows must match S_full's
            # row count. NOTE(review): purpose of the extra column is not
            # evident from this code — confirm.
            S_full = np.hstack([S_full,np.zeros((1025,1))])
            S_filter = librosa.decompose.nn_filter(S_full,
                                               aggregate=np.median,
                                               metric='cosine')
            # Clamp the filtered estimate so it never exceeds the input magnitude.
            S_filter = np.minimum(S_full, S_filter)

            margin_i, margin_v = 2, 10  # NOTE(review): margin_i is assigned but only ever deleted
            power = 1

            mask_v = librosa.util.softmask(S_full - S_filter,margin_v * S_filter,power=power)

            S_foreground = mask_v * S_full
            # NOTE(review): `phase` from magphase is never re-applied, so istft
            # receives the masked magnitudes only — confirm this is intended.
            foreground_audio = librosa.istft(S_foreground)
            # Mel spectrogram of the masked signal
            S = librosa.feature.melspectrogram(y=foreground_audio, sr=sr)
            # MFCCs computed from the dB-scaled mel spectrogram
            m = librosa.feature.mfcc(S=librosa.power_to_db(S), sr=sr,n_mfcc=128,norm='ortho')
            # Prosodic features: SMILExtract writes a CSV to a fixed scratch
            # path, which is read back and then removed.
            # NOTE(review): the scratch path is shared, so two runs at the same
            # time would presumably overwrite each other's file; `filename` is
            # interpolated into the shell command unquoted.
            os. system('cd && cd SER && ./opensmile-2.3.0/SMILExtract -C /home/development/shubhamm/SER/opensmile-2.3.0/config/prosodyShsViterbiLoudness.conf -I '+filename+' -csvoutput /home/development/shubhamm/shubham.csv -instname '+file[:-4]+pp)
            p = pd.read_csv('/home/development/shubhamm/shubham.csv',sep=';').drop(columns=['name','frameTime']).to_numpy()
            os.system('rm /home/development/shubhamm/shubham.csv')

            # Mean over axis 1 for mel and MFCC, prosodic values flattened as-is.
            vector+=np.hstack([S.mean(1).flatten(),m.mean(1).flatten(),p.flatten()])
            del S,sr,clip,S_full,phase,S_filter,S_foreground,margin_v,margin_i,mask_v
          
        # Average over the clips of this file.
        # NOTE(review): a file with no clips divides by zero here and yields a
        # NaN vector.
        audio[file[:-4]] = vector/len(segments)
        del vector,segments
    return audio

In [None]:
# Build one feature dict across every dataset root in `paths`.
# NOTE(review): `paths` is not defined in any cell visible here.
audio_features={}
for path in paths:
    audio_features=audio_feature_extractor(path,audio_features)
    # The dump sits inside the loop, so the pickle is rewritten after each
    # path and always holds the features gathered so far.
    with open('/home/development/shubhamm/MTP/mustard_PP_non_audio_features.pickle', 'wb') as f:
        pickle.dump(audio_features, f, pickle.HIGHEST_PROTOCOL)


100%|██████████| 260/260 [04:58<00:00,  1.15s/it]
100%|██████████| 260/260 [15:51<00:00,  3.66s/it]


# Rough

In [None]:
# # b=pd.read_csv('utterance_text.csv')
# # utter=list(b['Utterance'])
# a=pd.read_csv('MUStARD-Final.csv')
# a.dropna(inplace=True)
# c_or_u=[]
# count=a["KEY"].value_counts().to_dict()
# for i in set(a['KEY']):
#     for j in range(count[i]-1):
#         c_or_u.append('C')
#     c_or_u.append('U')
# a['TYPE']=c_or_u
# print(a['TYPE'].value_counts())
# # a[['KEY','TYPE']]

In [None]:
# utter_path = "utterances_final/"
# context_path = "context_final/"
# utter_path_2="utterance_nolaughter/"

# path='/home/development/shubhamm/SER/Mustard/'

# s1_files = get_filelist(path+utter_path_2)
# s2_files = get_filelist(path+'Mustard_audio_only/'+context_path)

# a=pd.read_csv('MUStARD-Final.csv')
# a.dropna(inplace=True)
# c_or_u=[]
# count=a["KEY"].value_counts().to_dict()
# for i in set(a['KEY']):
#     for j in range(count[i]-1):
#         c_or_u.append('C')
#     c_or_u.append('U')
# a['TYPE']=c_or_u


# len(s1_files),len(s2_files)
# Folder scanned by the cells below. This rebinds the notebook-global `path`
# (also the loop variable of earlier cells), which extract_prosodic and
# extract_mfcc read directly.
path='/home/development/shubhamm/SER/Mustard/MELD/MELD/additions/additions_context_wav/'

# s1_files = get_filelist('/home/development/shubhamm/SER/Mustard/'+utter_path_2)
# Entries returned by get_filelist keep their file extension.
files = get_filelist(path)
len(files)

264

In [None]:
# files


In [None]:
def extract_prosodic(directory,list_dir):
    """Run openSMILE's prosody config on each named clip in `directory`.

    For every ``name`` in `list_dir`, ``directory + name + '.wav'`` is analysed
    and the CSV is written to ``directory + 'prosodic/' + name + '.csv'`` with
    ``name`` as the instance name. The output location is derived from
    `directory`, not from a notebook-level ``path`` variable, so the function
    no longer depends on hidden kernel state.

    Args:
        directory: Folder prefix holding the wav files; used as a plain string
            prefix, so it must end with '/'. This function does not create the
            ``prosodic/`` sub-folder.
        list_dir: Clip names without the '.wav' extension.
    """
    # Local import: shlex is not part of the notebook's import cell.
    import shlex

    open_smile_path='/home/development/shubhamm/SER/'
    config = open_smile_path+'opensmile-2.3.0/config/prosodyShsViterbiLoudness.conf'
    for name in tqdm(list_dir):
        # Every interpolated value is shell-quoted so names with spaces or
        # metacharacters survive; plain names are passed through unchanged.
        os.system(
            'cd && cd SER && ./opensmile-2.3.0/SMILExtract'
            + ' -C ' + shlex.quote(config)
            + ' -I ' + shlex.quote(directory + name + '.wav')
            + ' -csvoutput ' + shlex.quote(directory + 'prosodic/' + name + '.csv')
            + ' -instname ' + shlex.quote(name)
        )


In [None]:
# NOTE(review): get_filelist returns names with their extension, while
# extract_prosodic appends '.wav' to every entry — confirm the names passed
# here are extension-less.
extract_prosodic(path,files)
# extract_prosodic(context_path,s2_files)
# os. system('clear')
# os. system('cd .. && ls')
# os. system('ls')

100%|██████████| 690/690 [11:41<00:00,  1.02s/it]


In [None]:
# Collect every .arff file in the 'prosodic/' folder (relative to the working directory)
files = [arff for arff in os.listdir('prosodic/') if arff.endswith(".arff")]

# Function for converting arff list to csv list
def toCsv(content):
    """Turn the lines of an ARFF file into the lines of a CSV file.

    Lines before the first one containing "@data" are scanned for
    "@attribute"; the token following it becomes a column name. When "@data"
    is reached, a comma-joined header line is emitted and every remaining line
    is copied through verbatim. If "@data" never appears the result is empty.

    Args:
        content: Iterable of text lines (as from ``readlines()``).

    Returns:
        List of CSV lines, header first.
    """
    columns = []
    remaining = iter(content)
    for raw in remaining:
        if "@attribute" in raw:
            tokens = raw.split()
            columns.append(tokens[tokens.index("@attribute") + 1])
        elif "@data" in raw:
            # Header first, then whatever is left of the input, untouched.
            return [",".join(columns) + "\n", *remaining]
    return []

# Main loop for reading and writing files
# Each ARFF file in 'prosodic/' gets a CSV with the same base name next to it.
# Iterating the list directly (not enumerate) lets tqdm show a total, and the
# unused index is gone.
for file in tqdm(files):
    arff_path = 'prosodic/' + file
    with open(arff_path, "r") as inFile:
        content = inFile.readlines()
    # The input is closed before the output file is opened.
    name = os.path.splitext(arff_path)[0]
    with open(name + ".csv", "w") as outFile:
        outFile.writelines(toCsv(content))

In [None]:
# Number of CSV files currently in 'prosodic/'.
files = [entry for entry in os.listdir('prosodic/') if entry.endswith(".csv")]
len(files)

In [None]:
# Number of ARFF files currently in 'small_prosodic/'.
files = [entry for entry in os.listdir('small_prosodic/') if entry.endswith(".arff")]
len(files)

0

In [None]:
# Prosodic features per file (without the 'name'/'frameTime' columns), keyed
# by file name minus its extension.
small_pros_dict={}
for i in tqdm(files):
    # Read the file that belongs to this entry. Reading files[0] on every pass
    # would give every key the first file's features.
    small_pros_dict[os.path.splitext(i)[0]]=pd.read_csv('prosodic/'+i,sep=';').drop(columns=['name','frameTime']).to_numpy()
    

In [None]:
# Preview one clip's prosodic CSV (semicolon-separated) with all columns kept.
pd.read_csv(path+'prosodic/1_10004_c.csv',sep=';')#.drop(columns=['name','frameTime']).to_numpy()


Unnamed: 0,name,frameTime,F0final_sma_stddev,F0final_sma_amean,F0final_sma_linregc1,F0final_sma_centroid,F0final_sma_percentile10.0,F0final_sma_percentile90.0,F0final_sma_pctlrange0-1,F0finalLog_sma_stddev,...,HarmonicsToNoiseRatioACFLogdB_sma_percentile10.0,HarmonicsToNoiseRatioACFLogdB_sma_percentile90.0,HarmonicsToNoiseRatioACFLogdB_sma_pctlrange0-1,loudness_sma_stddev,loudness_sma_amean,loudness_sma_linregc1,loudness_sma_centroid,loudness_sma_percentile10.0,loudness_sma_percentile90.0,loudness_sma_pctlrange0-1
0,'1_10004_c',0.0,70.14959,177.7012,-0.106867,0.462534,116.7205,242.188,125.4675,5.781758,...,-3.299231,8.74311,12.04234,0.75762,0.739335,0.000166,0.529522,0.127184,1.915603,1.788419


In [None]:
# Preview the same clip's MFCC CSV without its 'frameIndex'/'frameTime' columns.
pd.read_csv(path+'mfcc/1_10004_c.csv',sep=';').drop(columns=['frameIndex','frameTime'])


Unnamed: 0,pcm_fftMag_mfcc[1],pcm_fftMag_mfcc[2],pcm_fftMag_mfcc[3],pcm_fftMag_mfcc[4],pcm_fftMag_mfcc[5],pcm_fftMag_mfcc[6],pcm_fftMag_mfcc[7],pcm_fftMag_mfcc[8],pcm_fftMag_mfcc[9],pcm_fftMag_mfcc[10],...,pcm_fftMag_mfcc_de_de[4],pcm_fftMag_mfcc_de_de[5],pcm_fftMag_mfcc_de_de[6],pcm_fftMag_mfcc_de_de[7],pcm_fftMag_mfcc_de_de[8],pcm_fftMag_mfcc_de_de[9],pcm_fftMag_mfcc_de_de[10],pcm_fftMag_mfcc_de_de[11],pcm_fftMag_mfcc_de_de[12],pcm_LOGenergy_de_de
0,-8.011848,-1.449537,9.305407,5.850235,10.602720,-4.889924,-7.826927,2.134107,5.340427,1.322865,...,-0.168573,-0.169165,0.410604,1.266010,-0.863432,-0.568962,-0.243041,0.505993,0.974722,-0.010129
1,-8.112675,-3.031580,8.491982,8.942196,11.417730,-1.514646,-4.485522,21.598760,14.122560,3.164012,...,-0.132601,-0.186677,0.958386,1.556560,-2.145867,-0.723303,0.246346,0.673896,1.209774,-0.002203
2,-7.629745,-1.936231,1.781729,6.014417,16.131340,-14.885370,-3.343812,14.144940,10.724350,1.576625,...,-0.175129,-0.275914,1.825038,0.360848,-2.782104,-0.293552,0.605440,-0.119224,0.366426,0.013846
3,-9.654575,-2.752824,3.284466,5.728155,9.176538,-15.528140,-1.671631,7.567946,3.983313,-5.377982,...,-0.282833,-0.066017,1.150807,-1.624789,-2.219182,0.025718,0.659019,-1.419318,-1.094173,0.033934
4,-8.416073,0.665511,4.999913,5.828171,15.729310,7.738194,27.448680,14.458830,10.181080,4.566329,...,-0.510999,-0.425739,-1.046760,-3.301112,-0.596322,-0.041356,-0.359432,-2.161040,-1.224635,0.046806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592,-19.917360,13.014040,-8.102421,6.420061,14.038600,-9.414040,0.525273,3.763247,7.064021,8.775291,...,-0.361436,-0.237419,1.157489,-0.038683,-1.812048,0.025067,-1.407517,0.230051,1.914401,-0.018352
1593,-24.235880,8.361012,-6.801124,13.449990,10.483570,-14.176460,3.196768,6.512711,13.489370,13.359440,...,-1.342942,-0.191297,1.940438,0.206924,-1.307752,-0.669350,-1.409484,-0.274665,1.819853,-0.084709
1594,-28.661170,3.928953,-15.463370,8.810358,20.950600,-10.487680,3.047817,-0.599194,0.565223,-1.148873,...,-1.313435,-0.665973,1.943712,0.565858,0.060547,-0.453732,-0.065709,-0.280225,-0.053695,-0.121552
1595,-27.974280,0.410627,-22.709690,4.849070,22.738480,-10.819020,3.805053,-8.984485,4.243677,-6.001206,...,-0.367119,-0.919875,1.049714,0.435541,1.276639,-0.019879,0.917061,0.327728,-1.192540,-0.069243


In [None]:
# Persist the prosodic feature dict. The context manager closes the file
# handle, which a bare open(...) argument never does explicitly.
with open('pros_dict.pkl','wb') as f:
    pickle.dump(small_pros_dict,f)

## MFCC

In [None]:
# os. system('cd .. && ./opensmile-2.3.0/SMILExtract -C '+open_smile_path+'opensmile-2.3.0/config/prosodyShsViterbiLoudness.conf -I '+path+directory+list_dir[entry]+'.wav -csvoutput '+path[:-19]+'mall_prosodic/'+list_dir[entry]+'.csv -instname '+list_dir[entry])
# Folder whose audio is fed to extract_mfcc below; `path` is also read as a
# global inside that function.
path='/home/development/shubhamm/SER/Mustard/MELD/MELD/additions/additions_context_wav/'
# Entries returned by get_filelist keep their file extension.
files = get_filelist(path)


In [None]:
def extract_mfcc(directory,list_dir):
    """Run openSMILE's MFCC12_E_D_A_Z config on each named clip in `directory`.

    For every ``name`` in `list_dir`, ``directory + name + '.wav'`` is analysed
    and the output is written to ``directory + 'mfcc/' + name + '.csv'``. Both
    locations are derived from `directory`, not from a notebook-level ``path``
    variable, so the function no longer depends on hidden kernel state.

    Args:
        directory: Folder prefix holding the wav files; used as a plain string
            prefix, so it must end with '/'. This function does not create the
            ``mfcc/`` sub-folder.
        list_dir: Clip names without the '.wav' extension.
    """
    # Local import: shlex is not part of the notebook's import cell.
    import shlex

    open_smile_path='/home/development/shubhamm/SER/'
    config = open_smile_path+'opensmile-2.3.0/config/MFCC12_E_D_A_Z.conf'
    for name in tqdm(list_dir):
        # Every interpolated value is shell-quoted so names with spaces or
        # metacharacters survive; plain names are passed through unchanged.
        os.system(
            'cd && cd SER && ./opensmile-2.3.0/SMILExtract'
            + ' -C ' + shlex.quote(config)
            + ' -I ' + shlex.quote(directory + name + '.wav')
            + ' -O ' + shlex.quote(directory + 'mfcc/' + name + '.csv')
        )


In [None]:
# NOTE(review): get_filelist returns names with their extension, while
# extract_mfcc appends '.wav' to every entry — confirm the names passed here
# are extension-less.
extract_mfcc(path,files)
# extract_mfcc(context_path,s2_files)


100%|██████████| 690/690 [03:58<00:00,  2.89it/s]


In [None]:
# pd.read_csv('open_mfcc/'+s1_files[1]+'.csv',sep=';')


880

In [None]:
# Row count of every MFCC CSV in 'mfcc/', keyed by file name without '.csv'.
len_mapping = {}
files = [entry for entry in os.listdir('mfcc/') if entry.endswith(".csv")]
for i in tqdm(files):
    frames = pd.read_csv('mfcc/' + i, sep=';').drop(columns=['frameIndex', 'frameTime'])
    len_mapping[i[:-4]] = frames.shape[0]

    

  6%|▌         | 643/11264 [00:04<01:12, 145.72it/s]


KeyboardInterrupt: 

In [None]:
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a cell, so this line displays nothing here.
len_mapping
# Persist the name -> row-count mapping.
with open('mfcc_length.pkl', 'wb') as f:
        pickle.dump(len_mapping, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Smallest and largest row count among the files recorded in len_mapping.
min(len_mapping.values()),max(len_mapping.values())

(34, 1934)

In [None]:
# Load one MFCC matrix and check the shape after zero-padding it.
xx=pd.read_csv('open_mfcc/1.csv',sep=';').drop(columns=['name','frameTime']).to_numpy()
length=xx.shape[0]
# Appends (3042-length+1) all-zero rows of width 39, i.e. 3043 rows in total;
# 39 has to equal xx's column count for vstack to succeed.
# NOTE(review): 3042 is presumably the longest sequence in the data — confirm.
np.vstack([ xx,[[0.]*39]*(3042-length+1)]).shape