In [None]:
%pip install moviepy

In [None]:
import os
import numpy as np
import pandas as pd
from moviepy.editor import VideoFileClip

In [None]:
from preprocessing import create_spectrogram

# Clip extraction

*Given a start and end time for one segment (a piece or a movement, i.e. the smallest 'continuous' stretch of music)* \
*Window = 10 sec*

Method 1. Sliding window, no overlap \
Method 2. Sliding window, overlap (___ sec) \
Method 3. Random sampling (____ clips per ____ sec segment)

## Notes

1. Videos are in 720p where available; otherwise the highest resolution I can get.

In [None]:
# Source video for clip extraction: `filename` inside `video_dir`.
video_dir = "data/"
filename = "1234567.mp4"

# Build the full path first, then open the video; `vid` is what the
# extraction cell further down slices into clips.
video_path = os.path.join(video_dir, filename)
vid = VideoFileClip(video_path)

In [None]:
# Output layout: extracted video clips and their matching audio files are
# written to sibling folders under the dataset root.
root_dir = "data"
clip_dir = os.path.join(root_dir, "clips")
audio_dir = os.path.join(root_dir, "audio")

# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
for out_dir in (clip_dir, audio_dir):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Target video frame rate, in frames per second.
video_fps = 30

## Segments so I don't forget

Current total (approx): 7 hr 50 min

Fast:
1. La campanella: 0:05 to 4:25
2. scarbo: 0:10 to 8:24
3. 

Slow:
1. tristesse: 0:05 to 4:10
2. arabesque 1 (debussy): 0:00 to 4:21
3. handel g minor minuet: 0:00 to 3:31
4. 

bit of both:
1. chopin ballade: 0:05 to 9:25
2. hammerklavier (full): 0:00 to 10:32, 10:39 to 13:09, 13:15 to 28:45, 29:00 to 39:29
3. goldberg variations (full): 0:05 to 1:23:22
4. italian concerto (full): 0:11 to 3:52, 4:00 to 8:06, 8:51 to 11:54
5. bach WTC book 1 (full): 0:24 to 2:05:15
6. appassionata (full): 0:02 to 9:27, 9:37 to 22:47
7. pathetique (full): 0:12 to 8:44, 9:04 to 13:40, 13:49 to 18:17
8. waldstein (full): 0:53 to 9:10, 9:31 to 9:43, 10:24 to 12:00, 12:25 to 13:29, 14:12 to 23:45
9. moonlight sonata (full): 0:46 to 9:02, 9:09 to 14:46
10. pastoral sonata (full): 0:09 to 9:26, 9:35 to 16:16, 16:26 to 23:01
11. mozart sonata 8 (full): 0:25 to 5:55, 6:13 to 15:38, 15:48 to 18:42
12. mozart sonata 1 (full): 0:39 to 7:22, 7:33 to 14:50, 15:05 to 20:02
13. mozart sonata 3 (full): 0:02 to 6:35, 6:41 to 13:41, 13:47 to 18:24
14. mozart sonata 12 (full): 0:05 to 4:57, 5:15 to 9:48, 10:00 to 15:16
15. mozart sonata 6 (full): 0:03 to 3:41, 3:48 to 9:39, 9:49 to 22:04
16. mozart sonata 10 (full): 0:11 to 2:36, 2:56 to 4:43, 5:03 to 6:01, 6:14 to 11:38, 12:02 to 15:26
17. 

AIM FOR 30 HOURS (a bit over 10k samples)

In [None]:
# Define segment of video to extract from
start_sec = 0
end_sec = 0

dur = 10
audio_subclip_dur = 2  # duration of audio subclips (for video llama) in seconds

## Method 1

In [None]:
# Method 1: cut [start_sec, end_sec) into back-to-back, non-overlapping windows
# of `dur` seconds and save each window's video and audio as separate files.
#
# NOTE: fname is everything before the FIRST dot of `filename`, so a dotted
# name such as "a.b.mp4" is shortened to "a".
# NOTE(review): clip_counter restarts at 0 on every run, so extracting a second
# segment from the same file reuses the names {fname}_0, {fname}_1, ... and
# overwrites the earlier outputs -- confirm this is intended.
fname = filename.split('.')[0]
clip_counter = 0

# Stopping the range at end_sec - dur (inclusive) yields only full-length
# windows; a trailing remainder shorter than `dur` is dropped.
for clip_start in range(start_sec, end_sec - dur + 1, dur):
    clip = vid.subclip(clip_start, clip_start + dur)

    # Save the video clip without its audio track.
    # Only the whole `dur`-second clip is stored here: per the author's note,
    # the preprocessing in the video llama repo extracts every n_frms frames
    # (first attempt: use every frame).
    clip.write_videofile(
        os.path.join(clip_dir, f'{fname}_{clip_counter}.mp4'),
        fps=video_fps,  # shared constant instead of a second hard-coded 30
        audio=False,
    )

    # Save the matching audio clip as WAV (pcm_s16le codec, fps=44100).
    # Again only the whole clip is stored: per the author's note, the video
    # llama preprocessing extracts consecutive audio subclips, planned as
    # 5 subclips of 2 sec each (to span the entire audio clip).
    clip_audio = clip.audio
    clip_audio.write_audiofile(
        os.path.join(audio_dir, f'{fname}_{clip_counter}.wav'),
        fps=44100,
        codec="pcm_s16le",
    )

    # --- Disabled: per-subclip spectrogram export (kept for reference) ---
    # for j in range(0, dur, audio_subclip_dur):
    #
    #     # save audio subclip as wav
    #     audio_subclip = clip_audio.subclip(j, j+audio_subclip_dur)
    #     audio_subclip_path = os.path.join(audio_dir, fname, str(clip_counter), str(int(j / audio_subclip_dur)) + ".wav")
    #     audio_subclip.write_audiofile(audio_subclip_path)
    #
    #     # extract and save spectrogram
    #     create_spectrogram(audio_subclip_path)
    #
    #     # delete wav
    #     os.remove(audio_subclip_path)

    # --- Disabled: extract, preprocess, and save frames (kept for reference) ---
    # TODO: optical flow, contouring?
    # NOTE: `clip_vid_dir` is not defined anywhere in the visible notebook.
    # frame_counter = 0
    # for frame in clip.iter_frames(fps=video_fps):
    #     # frame is a H x W x N (N = 3 for RGB) np.array
    #     frame_path = os.path.join(clip_vid_dir, str(frame_counter) + ".npy")
    #     np.save(frame_path, frame)
    #     frame_counter += 1

    clip_counter += 1

## Create csv for Dataset

In [None]:
# Fraction of the clip/audio pairs held out for validation; the remaining
# (1 - val_prop) is sampled as the training split in the next cell.
val_prop = 0.1
# A separate test set will be collected later (can be more isolated clips and such).

In [None]:
# Pair every file in clip_dir with the same-named .wav in audio_dir, then write
# a train/validation split of those pairs to CSV files under root_dir.
clip_paths = []
audio_paths = []

# os.listdir() returns entries in arbitrary order. Sorting fixes the row order,
# which is what makes the seeded sample below reproducible across runs and
# machines.
for file in sorted(os.listdir(clip_dir)):
    # splitext strips only the final extension, so a stem that itself contains
    # dots still resolves to the right audio file.
    fname = os.path.splitext(file)[0]
    audio_path = os.path.join(audio_dir, f'{fname}.wav')
    if os.path.exists(audio_path):
        clip_paths.append(os.path.join(clip_dir, file))
        audio_paths.append(audio_path)
    else:
        print(f'No audio file for clip {file}')

df = pd.DataFrame({'clip_path': clip_paths, 'audio_path': audio_paths})

# Training rows are drawn without replacement using a fixed seed; validation is
# every row whose clip_path was not drawn.
train_df = df.sample(frac=(1-val_prop), replace=False, random_state=0)
val_df = df[~df['clip_path'].isin(train_df['clip_path'])]

# NOTE: to_csv keeps its default index=True, so each file starts with an
# unnamed index column holding the row's position in `df`.
train_df.to_csv(os.path.join(root_dir, 'train_ds.csv'))
val_df.to_csv(os.path.join(root_dir, 'val_ds.csv'))