# Audio extraction in wav format from mp4 files

In [75]:
!pip install pydub
!pip install ffmpeg

Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py): started
  Building wheel for ffmpeg (setup.py): finished with status 'done'
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6083 sha256=c1a8361168cf0e3115b107d3db0a619c967b733f27ec7dd809bf2e4e35d80884
  Stored in directory: c:\users\bkoch\appdata\local\pip\cache\wheels\30\33\46\5ab7eca55b9490dddbf3441c68a29535996270ef1ce8b9b6d7
Successfully built ffmpeg
Installing collected packages: ffmpeg
Successfully installed ffmpeg-1.4


In [142]:
import os
import pydub
import librosa
import soundfile as sf
print(librosa.__version__)

0.8.1


In [84]:
DATA_FOLDER_PTH=os.path.join(os.getcwd(), os.pardir, 'data')

# raw
TRAIN_VIDEO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/train', 'train_splits')
TRAIN_TEXT_FILE_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/train', 'train_sent_emo.csv')

DEV_VIDEO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/dev', 'dev_splits_complete')
DEV_TEXT_FILE_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/dev', 'dev_sent_emo.csv')

TEST_VIDEO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/test', 'output_repeated_splits_test')
TEST_TEXT_FILE_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/test', 'test_sent_emo.csv')

# processed
TRAIN_AUDIO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'processed/MELD/train_wavs')
DEV_AUDIO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'processed/MELD/dev_wavs')
TEST_AUDIO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'processed/MELD/test_wavs')


# Inspect Video files

In [93]:
os.listdir(TRAIN_VIDEO_FOLDER_PTH)[:10]

['dia0_utt0.mp4',
 'dia0_utt1.mp4',
 'dia0_utt10.mp4',
 'dia0_utt11.mp4',
 'dia0_utt12.mp4',
 'dia0_utt13.mp4',
 'dia0_utt2.mp4',
 'dia0_utt3.mp4',
 'dia0_utt4.mp4',
 'dia0_utt5.mp4']

# Testing Audio Extraction

In [119]:
video_file_name=os.listdir(TRAIN_VIDEO_FOLDER_PTH)[0]
video_file_name_without_extension=video_file_name.split('.')[0]
video_file_pth=os.path.join(TRAIN_VIDEO_FOLDER_PTH, video_file_name)

In [126]:
os.system('ffmpeg -i {} -f wav {} '.format(video_file_pth, os.path.join(OUTPUT_FOLDER_PTH, video_file_name_without_extension+'.wav')))

0

## View output format

In [133]:
import pprint

info=pydub.utils.mediainfo(os.path.join(OUTPUT_FOLDER_PTH, video_file_name_without_extension+'.wav'))
data={
    'audio_name':audio_pth.split('.')[-2],
    'sample_rate':int(info['sample_rate']),
    'channels':int(info['channels']),
    'bits_pre_sample':int(info['bits_per_sample']),
    'duration': float(info['duration']),
    'codec_long_name':str(info['codec_long_name']),
    'size':int(info['size'])
}

pprint.pprint(data)

{'audio_name': '\\output\\dia0_utt0',
 'bits_pre_sample': 16,
 'channels': 2,
 'codec_long_name': 'PCM signed 16-bit little-endian',
 'duration': 5.674671,
 'sample_rate': 44100,
 'size': 1001090}


In [137]:
pth=os.path.join(OUTPUT_FOLDER_PTH, video_file_name_without_extension+'.wav')
y,sr=librosa.load(pth, mono=False, sr=None)
print(pth)
print(y.shape)
print(sr)

e:\machine_learning\projects\multimodal-emotion-classification\notebooks\..\output\dia0_utt0.wav
(2, 250253)
44100


## Resample with librosa test

In [144]:
y,sr=librosa.load(pth, mono=True, sr=22050)
sf.write(os.path.join(OUTPUT_FOLDER_PTH, video_file_name_without_extension+'_resample_mono.wav'), y, sr)

info=pydub.utils.mediainfo(os.path.join(OUTPUT_FOLDER_PTH, video_file_name_without_extension+'_resample_mono.wav'))
data={
    'audio_name':audio_pth.split('.')[-2],
    'sample_rate':int(info['sample_rate']),
    'channels':int(info['channels']),
    'bits_pre_sample':int(info['bits_per_sample']),
    'duration': float(info['duration']),
    'codec_long_name':str(info['codec_long_name']),
    'size':int(info['size'])
}

pprint.pprint(data)

{'audio_name': '\\output\\dia0_utt0',
 'bits_pre_sample': 16,
 'channels': 1,
 'codec_long_name': 'PCM signed 16-bit little-endian',
 'duration': 5.674694,
 'sample_rate': 22050,
 'size': 250298}


#### Note: We can resample during training with multiple workers in the background.

# Train dataset audio extraction

In [91]:
for video_file_name in os.listdir(TRAIN_VIDEO_FOLDER_PTH):
    video_file_name_without_extension=video_file_name.split('.')[0]
    video_file_pth=os.path.join(TRAIN_VIDEO_FOLDER_PTH, video_file_name)

    os.system('ffmpeg -i {} -f wav {}'.format(video_file_pth, os.path.join(TRAIN_AUDIO_FOLDER_PTH, video_file_name_without_extension+'.wav')))

# Dev dataset audio extraction

In [111]:
for video_file_name in os.listdir(DEV_VIDEO_FOLDER_PTH):
    video_file_name_without_extension=video_file_name.split('.')[0]
    video_file_pth=os.path.join(DEV_VIDEO_FOLDER_PTH, video_file_name)

    os.system('ffmpeg -i {} -f wav {}'.format(video_file_pth, os.path.join(DEV_AUDIO_FOLDER_PTH, video_file_name_without_extension+'.wav')))