# Video Segmentation using pyAudioAnalysis
This is a note notebook implementing the audio segmentation function of 
pyAudioAnalysis to cut pauses and silence out of long videos.

In [1]:
from pyAudioAnalysis import audioSegmentation as aS
from pathlib import Path
import soundfile as sf
import ipywidgets as widgets
from IPython.display import display
from ipyfilechooser import FileChooser

Place your input video file into the `./inputs` folder and choose it using the filechooser.

In [2]:
input_path = Path("./input")
fc = FileChooser(str(input_path))
display(fc)

FileChooser(path='input', filename='', title='HTML(value='', layout=Layout(display='none'))', show_hidden='Fal…

In [3]:
video_path = input_path / fc.selected_filename
output_path = Path("./outputs/") / video_path.stem
if output_path.exists():
    !rm -rf {output_path}
output_path.mkdir(parents=True, exist_ok=True)

First we must extract the audio from the input video. We do this using `ffmpeg` from the command line.

In [20]:
audio_path = output_path / (video_path.stem + ".wav")
!ffmpeg -nostats -loglevel 0 -i {video_path} {audio_path} -y

In [5]:
print("Choose window size in ms")
window_slider = widgets.FloatSlider(min=0.0, max=500.0, step=5, value=250)
display(window_slider)
    
print("Choose step size in ms")
step_slider = widgets.FloatSlider(min=0.0, max=5.0, step=0.1, value=2.9)
display(step_slider)

print("Choose strictness for silence removal. Higher -> more strict")
weight_slider = widgets.FloatSlider(min=0.0, max=1.0, step=0.1, value=0.5)
display(weight_slider)

Choose window size in ms


FloatSlider(value=250.0, max=500.0, step=5.0)

Choose step size in ms


FloatSlider(value=2.9, max=5.0)

Choose strictness for silence removal. Higher -> more strict


FloatSlider(value=0.5, max=1.0)

In [6]:
window_size = window_slider.value / 1000
step_size = step_slider.value / 1000
weight = weight_slider.value

data, fs = sf.read(audio_path)
segment_limits = aS.silence_removal(data, fs, window_size, step_size)

In [7]:
print(f"Process audio into {len(segment_limits)} segments.")
total_removed = segment_limits[-1][1] - sum([y - x for x,y in segment_limits])
print(f"This removed a total of {total_removed:.2f} seconds from the original video.")

Process audio into 175 segments.
This removed a total of 154.38 seconds from the original video.


In [8]:
import math
def s_to_hms(seconds, as_string=False):
    hours = math.floor(seconds / 3600)
    minutes = math.floor(seconds % 3600 / 60)
    s = seconds % 60
    if not as_string:
        return hours, minutes, s
    else:
        seconds_string = f"{s:.2f}" if s >= 10 else f"0{s:.2f}"
        return f"{hours:02}:{minutes:02}:{seconds_string}"

In [19]:
import subprocess
from tqdm import tqdm

for i, (start, end) in enumerate(tqdm(segment_limits, "Creating segments")):
    !ffmpeg -nostats -loglevel 0 -i {video_path} \
        -ss {s_to_hms(start, as_string=True)} -to {s_to_hms(end, as_string=True)} \
        -c:v libx264 -c:a aac {output_path}/segment_{i}.mp4 -y

Creating segments: 100%|██████████| 175/175 [10:17<00:00,  3.53s/it]


In [21]:
segment_list = output_path / "segment_list.txt"
!rm {segment_list}
with open(segment_list, 'w+') as f:
    f.writelines([f"file 'segment_{i}.mp4'\n" for i in range(len(segment_limits))])
!ffmpeg -nostats -loglevel 0 -f concat -safe 0 -i {segment_list} -c copy {output_path}/full.mp4 -y
print(f"Concatenated segments to {output_path}/full.mp4")

Concatenated segments to outputs/signale/full.mp4
