## Audio Feature Extraction

**Packages**

In [1]:
import h5py
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
from tqdm import tqdm
import os.path as osp
import torch
import numpy as np
from moviepy.video.io.VideoFileClip import VideoFileClip

### Loading VGGish model

In [2]:
# Fetch the 'vggish' entry point of the 'harritaylor/torchvggish' repository through
# torch.hub. The resulting `vggish` object is a notebook-level global that
# get_segFeats reads directly (it is not passed in as an argument).
import torch
vggish = torch.hub.load('harritaylor/torchvggish', 'vggish')
# Put the model in evaluation mode. Being the last expression of the cell, this
# call also makes the notebook display the module structure as the cell output.
vggish.eval()


Using cache found in C:\Users\Msc 2/.cache\torch\hub\harritaylor_torchvggish_master


VGGish(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False

### Feature Extractor

In [3]:
def _pad_to_length(samples, target_len, pad_front=False):
    """Zero-pad ``samples`` along axis 0 so it has at least ``target_len`` rows.

    Arrays that already have ``target_len`` rows or more are returned unchanged
    (so a non-positive pad amount never reaches ``np.pad``).
    ``pad_front=True`` places the zeros before the samples instead of after.
    """
    missing = target_len - samples.shape[0]
    if missing <= 0:
        return samples
    row_pad = (missing, 0) if pad_front else (0, missing)
    # Only the sample axis is padded; any remaining axes (channels) are left alone.
    return np.pad(samples, [row_pad] + [(0, 0)] * (samples.ndim - 1), 'constant')


def get_segFeats(path2vid, picks):
    """Compute one VGGish audio embedding per entry of ``picks``.

    Only the *number* of picks is used: window ``k`` is taken around
    ``0.5 * k`` seconds of the audio track, regardless of the frame index
    stored in ``picks[k]`` (i.e. a fixed 0.5 s sampling step is assumed --
    TODO confirm this matches how ``picks`` was generated).

    Parameters
    ----------
    path2vid : str
        Path of the video file, opened with moviepy's ``VideoFileClip``.
    picks : sequence
        Sampled frame positions; only ``len(picks)`` and ``picks[1:]`` length
        matter here.

    Returns
    -------
    list of numpy.ndarray, or numpy.ndarray
        A list with one embedding per pick when the video has an audio track
        (the recorded runs show these stack to ``(len(picks), 128)``).
        A zero array of shape ``(len(picks), 128)`` when the video has no
        audio track or when ``picks`` is empty.

    Notes
    -----
    Relies on the notebook-level global ``vggish`` model.
    """
    sample_rate = 16000   # sampling rate passed to both moviepy and VGGish
    embed_dim = 128       # width of the zero placeholder for silent videos
    half_window = 0.5     # seconds on each side of a window centre

    n_picks = len(picks)
    if n_picks == 0:
        # Nothing to extract; previously one spurious feature was still produced.
        return np.zeros((0, embed_dim))

    clip = VideoFileClip(path2vid)
    try:
        audio = clip.audio
        if audio is None:
            # Video without an audio track: placeholder features of all zeros.
            return np.zeros((n_picks, embed_dim))

        dur = audio.duration
        Audio_Feats = []

        # no_grad: pure inference, no autograd graph needed.
        with tqdm(total=n_picks, position=0, leave=True) as pbar, torch.no_grad():
            # First window: keep the first half second and put the zeros in
            # FRONT, so the window is effectively centred on t = 0.
            first_sarr = audio.cutout(half_window, dur).to_soundarray(fps=sample_rate)
            first_sarr = _pad_to_length(first_sarr, sample_rate, pad_front=True)
            # .cpu() is a no-op for CPU tensors and is required before .numpy()
            # if the hub model placed itself on a GPU.
            feat = vggish.forward(first_sarr, fs=sample_rate).detach().cpu().numpy()
            Audio_Feats.append(feat)
            pbar.update(1)

            for step in range(1, n_picks):
                centre = half_window * step
                # Drop everything before (centre - 0.5) and everything after
                # (centre + 0.5); the two cutouts are intended to leave the
                # one-second span around `centre` (shorter near the end of
                # the track, hence the tail padding below).
                audio_clip = (
                    audio.cutout(0, centre - half_window)
                         .cutout(min(centre + half_window, dur), dur)
                )
                audio_arr = audio_clip.to_soundarray(fps=sample_rate)
                audio_arr = _pad_to_length(audio_arr, sample_rate)

                feat = vggish.forward(audio_arr, fs=sample_rate).detach().cpu().numpy()
                Audio_Feats.append(feat)
                pbar.update(1)
    finally:
        # Release the ffmpeg readers held by the clip (and its audio track).
        clip.close()

    return Audio_Feats
        

In [4]:
def extract_audio_feats(args):
    """Add an ``aud_feats`` dataset to every group of an HDF5 feature file.

    ``args`` is a dict with two entries:
      * ``'output_h5'``    -- path of the HDF5 file, opened in append mode.
      * ``'video_folder'`` -- folder that holds the video files.

    For each top-level key of the file, the ``video_name`` and ``picks``
    entries are read, ``get_segFeats`` is run on the corresponding video and
    the stacked result is written back as ``<key>/aud_feats``. Keys that
    already have an ``aud_feats`` entry are left untouched.
    """
    with h5py.File(args['output_h5'], 'a') as h5_file:
        for key in h5_file.keys():
            group = h5_file[key]
            already_done = 'aud_feats' in group.keys()
            if not already_done:
                print("Extracting Audio Feature from : ", key)

                # video_name is decoded, i.e. it is expected to be stored as bytes.
                video_name = group['video_name'][()].decode()
                picks = group['picks'][()]
                path2vid = args['video_folder'] + '/' + video_name

                audio_feats = np.array(get_segFeats(path2vid, picks))
                print('Feature Shape: ', audio_feats.shape)

                group.create_dataset('aud_feats', data=audio_feats)
            
            

### TVSum Audio Feature Extraction

In [22]:
# args={
#     'output_h5':'extracted_features/normal/TVSum05s.h5',
#     'video_folder':'../data/Public datasets/ydata-tvsum50-v1_1/video'
# }

In [24]:
# extract_audio_feats(args)

Extracting Audio Feature from :  video_1


100%|██████████| 708/708 [00:29<00:00, 23.60it/s]


Feature Shape:  (708, 128)
Extracting Audio Feature from :  video_10


100%|██████████| 267/267 [00:11<00:00, 24.06it/s]


Feature Shape:  (267, 128)
Extracting Audio Feature from :  video_11


100%|██████████| 314/314 [00:12<00:00, 24.27it/s]


Feature Shape:  (314, 128)
Extracting Audio Feature from :  video_12


100%|██████████| 902/902 [00:37<00:00, 23.82it/s]


Feature Shape:  (902, 128)
Extracting Audio Feature from :  video_13


100%|██████████| 283/283 [00:11<00:00, 24.03it/s]


Feature Shape:  (283, 128)
Extracting Audio Feature from :  video_14


100%|██████████| 389/389 [00:14<00:00, 25.94it/s]


Feature Shape:  (389, 128)
Extracting Audio Feature from :  video_15


100%|██████████| 289/289 [00:11<00:00, 25.12it/s]


Feature Shape:  (289, 128)
Extracting Audio Feature from :  video_16


100%|██████████| 796/796 [00:31<00:00, 25.03it/s]


Feature Shape:  (796, 128)
Extracting Audio Feature from :  video_17


100%|██████████| 488/488 [00:19<00:00, 25.03it/s]


Feature Shape:  (488, 128)
Extracting Audio Feature from :  video_18


100%|██████████| 811/811 [00:33<00:00, 24.38it/s]


Feature Shape:  (811, 128)
Extracting Audio Feature from :  video_19


100%|██████████| 384/384 [00:15<00:00, 25.24it/s]


Feature Shape:  (384, 128)
Extracting Audio Feature from :  video_2


100%|██████████| 376/376 [00:14<00:00, 26.07it/s]


Feature Shape:  (376, 128)
Extracting Audio Feature from :  video_20


100%|██████████| 521/521 [00:19<00:00, 26.32it/s]


Feature Shape:  (521, 128)
Extracting Audio Feature from :  video_21


100%|██████████| 1296/1296 [00:52<00:00, 24.86it/s]


Feature Shape:  (1296, 128)
Extracting Audio Feature from :  video_22


100%|██████████| 472/472 [00:18<00:00, 25.57it/s]


Feature Shape:  (472, 128)
Extracting Audio Feature from :  video_23


100%|██████████| 376/376 [00:15<00:00, 23.55it/s]


Feature Shape:  (376, 128)
Extracting Audio Feature from :  video_24


100%|██████████| 291/291 [00:12<00:00, 23.47it/s]


Feature Shape:  (291, 128)
Extracting Audio Feature from :  video_25


100%|██████████| 549/549 [00:23<00:00, 23.41it/s]


Feature Shape:  (549, 128)
Extracting Audio Feature from :  video_26


100%|██████████| 222/222 [00:08<00:00, 24.80it/s]


Feature Shape:  (222, 128)
Extracting Audio Feature from :  video_27


100%|██████████| 729/729 [00:29<00:00, 24.61it/s]


Feature Shape:  (729, 128)
Extracting Audio Feature from :  video_28


100%|██████████| 553/553 [00:23<00:00, 23.63it/s]


Feature Shape:  (553, 128)
Extracting Audio Feature from :  video_29


100%|██████████| 1172/1172 [00:46<00:00, 25.05it/s]


Feature Shape:  (1172, 128)
Extracting Audio Feature from :  video_3


100%|██████████| 1170/1170 [00:49<00:00, 23.82it/s]


Feature Shape:  (1170, 128)
Extracting Audio Feature from :  video_30


100%|██████████| 335/335 [00:14<00:00, 23.73it/s]


Feature Shape:  (335, 128)
Extracting Audio Feature from :  video_31


100%|██████████| 433/433 [00:17<00:00, 25.44it/s]


Feature Shape:  (433, 128)
Extracting Audio Feature from :  video_32


100%|██████████| 305/305 [00:12<00:00, 24.27it/s]


Feature Shape:  (305, 128)
Extracting Audio Feature from :  video_33


100%|██████████| 892/892 [00:37<00:00, 23.92it/s]


Feature Shape:  (892, 128)
Extracting Audio Feature from :  video_34


100%|██████████| 310/310 [00:12<00:00, 24.42it/s]


Feature Shape:  (310, 128)
Extracting Audio Feature from :  video_35


100%|██████████| 298/298 [00:11<00:00, 25.32it/s]


Feature Shape:  (298, 128)
Extracting Audio Feature from :  video_36


100%|██████████| 532/532 [00:20<00:00, 25.92it/s]


Feature Shape:  (532, 128)
Extracting Audio Feature from :  video_37


100%|██████████| 335/335 [00:12<00:00, 25.78it/s]


Feature Shape:  (335, 128)
Extracting Audio Feature from :  video_38


100%|██████████| 197/197 [00:08<00:00, 23.36it/s]


Feature Shape:  (197, 128)
Extracting Audio Feature from :  video_39


100%|██████████| 278/278 [00:12<00:00, 23.03it/s]


Feature Shape:  (278, 128)
Extracting Audio Feature from :  video_4


100%|██████████| 577/577 [00:23<00:00, 24.33it/s]


Feature Shape:  (577, 128)
Extracting Audio Feature from :  video_40


100%|██████████| 762/762 [00:30<00:00, 25.15it/s]


Feature Shape:  (762, 128)
Extracting Audio Feature from :  video_41


100%|██████████| 539/539 [00:22<00:00, 23.95it/s]


Feature Shape:  (539, 128)
Extracting Audio Feature from :  video_42


100%|██████████| 397/397 [00:16<00:00, 23.89it/s]


Feature Shape:  (397, 128)
Extracting Audio Feature from :  video_43


100%|██████████| 330/330 [00:13<00:00, 23.99it/s]


Feature Shape:  (330, 128)
Extracting Audio Feature from :  video_44


100%|██████████| 288/288 [00:11<00:00, 24.37it/s]


Feature Shape:  (288, 128)
Extracting Audio Feature from :  video_45


100%|██████████| 209/209 [00:08<00:00, 24.81it/s]


Feature Shape:  (209, 128)
Extracting Audio Feature from :  video_46


100%|██████████| 1021/1021 [00:42<00:00, 24.11it/s]


Feature Shape:  (1021, 128)
Extracting Audio Feature from :  video_47


100%|██████████| 380/380 [00:16<00:00, 23.19it/s]


Feature Shape:  (380, 128)
Extracting Audio Feature from :  video_48


100%|██████████| 260/260 [00:10<00:00, 23.77it/s]


Feature Shape:  (260, 128)
Extracting Audio Feature from :  video_49


100%|██████████| 399/399 [00:16<00:00, 24.34it/s]


Feature Shape:  (399, 128)
Extracting Audio Feature from :  video_5


100%|██████████| 223/223 [00:09<00:00, 24.11it/s]


Feature Shape:  (223, 128)
Extracting Audio Feature from :  video_50


100%|██████████| 461/461 [00:19<00:00, 23.37it/s]


Feature Shape:  (461, 128)
Extracting Audio Feature from :  video_6


100%|██████████| 646/646 [00:26<00:00, 24.31it/s]


Feature Shape:  (646, 128)
Extracting Audio Feature from :  video_7


100%|██████████| 298/298 [00:12<00:00, 24.31it/s]


Feature Shape:  (298, 128)
Extracting Audio Feature from :  video_8


100%|██████████| 659/659 [00:27<00:00, 23.97it/s]


Feature Shape:  (659, 128)
Extracting Audio Feature from :  video_9


100%|██████████| 468/468 [00:18<00:00, 25.59it/s]

Feature Shape:  (468, 128)





### SumMe Audio Feature Extraction

In [18]:
# args={
#     'output_h5':'extracted_features/normal/SUMMe.h5',
#     'video_folder':'../data/Public datasets/SUMMe/videos'
# }

In [21]:
# extract_audio_feats(args)

Extracting Audio Feature from :  video_1


100%|██████████| 299/299 [00:10<00:00, 28.17it/s]


Feature Shape:  (299, 128)
Extracting Audio Feature from :  video_10


100%|██████████| 331/331 [00:12<00:00, 27.58it/s]


Feature Shape:  (331, 128)
Extracting Audio Feature from :  video_11


100%|██████████| 648/648 [00:24<00:00, 26.47it/s]


Feature Shape:  (648, 128)
Extracting Audio Feature from :  video_12


100%|██████████| 107/107 [00:04<00:00, 23.77it/s]


Feature Shape:  (107, 128)
Extracting Audio Feature from :  video_13


100%|██████████| 63/63 [00:02<00:00, 25.23it/s]


Feature Shape:  (63, 128)
Extracting Audio Feature from :  video_14


100%|██████████| 212/212 [00:08<00:00, 24.01it/s]


Feature Shape:  (212, 128)
Extracting Audio Feature from :  video_15


100%|██████████| 307/307 [00:11<00:00, 25.87it/s]


Feature Shape:  (307, 128)
Extracting Audio Feature from :  video_16


100%|██████████| 406/406 [00:14<00:00, 27.42it/s]


Feature Shape:  (406, 128)
Extracting Audio Feature from :  video_17


100%|██████████| 171/171 [00:06<00:00, 26.78it/s]


Feature Shape:  (171, 128)
Extracting Audio Feature from :  video_18


100%|██████████| 207/207 [00:07<00:00, 27.98it/s]


Feature Shape:  (207, 128)
Extracting Audio Feature from :  video_19


100%|██████████| 204/204 [00:07<00:00, 26.56it/s]


Feature Shape:  (204, 128)
Extracting Audio Feature from :  video_2


100%|██████████| 315/315 [00:11<00:00, 27.12it/s]


Feature Shape:  (315, 128)
Extracting Audio Feature from :  video_20


100%|██████████| 445/445 [00:16<00:00, 27.14it/s]


Feature Shape:  (445, 128)
Extracting Audio Feature from :  video_21
Feature Shape:  (148, 128)
Extracting Audio Feature from :  video_22
Feature Shape:  (116, 128)
Extracting Audio Feature from :  video_23


100%|██████████| 257/257 [00:09<00:00, 26.91it/s]


Feature Shape:  (257, 128)
Extracting Audio Feature from :  video_24


100%|██████████| 644/644 [00:24<00:00, 26.56it/s]


Feature Shape:  (644, 128)
Extracting Audio Feature from :  video_25


100%|██████████| 345/345 [00:12<00:00, 27.16it/s]


Feature Shape:  (345, 128)
Extracting Audio Feature from :  video_3


100%|██████████| 222/222 [00:08<00:00, 27.03it/s]


Feature Shape:  (222, 128)
Extracting Audio Feature from :  video_4


100%|██████████| 204/204 [00:07<00:00, 27.34it/s]


Feature Shape:  (204, 128)
Extracting Audio Feature from :  video_5


100%|██████████| 342/342 [00:12<00:00, 26.48it/s]


Feature Shape:  (342, 128)
Extracting Audio Feature from :  video_6


100%|██████████| 292/292 [00:11<00:00, 26.33it/s]


Feature Shape:  (292, 128)
Extracting Audio Feature from :  video_7


100%|██████████| 338/338 [00:12<00:00, 26.23it/s]


Feature Shape:  (338, 128)
Extracting Audio Feature from :  video_8


100%|██████████| 603/603 [00:22<00:00, 27.35it/s]


Feature Shape:  (603, 128)
Extracting Audio Feature from :  video_9


100%|██████████| 85/85 [00:02<00:00, 28.65it/s]

Feature Shape:  (85, 128)





--------------

In [5]:
# Custom dataset run: the HDF5 feature file to update and the folder holding its videos.
args = dict(
    output_h5='extracted_features/normal/customSet.h5',
    video_folder='../data/Custom dataset',
)
extract_audio_feats(args)

Extracting Audio Feature from :  video_1


100%|██████████| 235/235 [00:08<00:00, 26.30it/s]


Feature Shape:  (235, 128)
Extracting Audio Feature from :  video_2


100%|██████████| 323/323 [00:12<00:00, 25.97it/s]


Feature Shape:  (323, 128)
Extracting Audio Feature from :  video_3


100%|██████████| 9749/9749 [06:35<00:00, 24.67it/s]


Feature Shape:  (9749, 128)
Extracting Audio Feature from :  video_4


100%|██████████| 884/884 [00:37<00:00, 23.67it/s]


Feature Shape:  (884, 128)
Extracting Audio Feature from :  video_5


100%|██████████| 354/354 [00:14<00:00, 25.25it/s]

Feature Shape:  (354, 128)





In [17]:
# Display the installed PyTorch version (shown as the cell output).
import torch
torch.__version__

'2.0.0'

In [3]:
# audio = VideoFileClip('../data/Public datasets/ydata-tvsum50-v1_1/video/sTEELN-vY30.mp4').audio


# dur=audio.duration


# first_sarr = audio.cutout(0.5,dur).to_soundarray(fps=16000)