In [2]:
import librosa
import librosa.display
import numpy as np
import math
import matplotlib.pyplot as plt
import numpy as np
import cv2 as cv2
from visual_clutter import Vlc
from tensorflow import keras
import pandas as pd
import psycopg2
import json
import os

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


This notebook takes a song video, extracts audio and visual features from it, and inserts the results of the analysis into the db_concert_analyses table. 

Prerequisites:
- Having the database up and running
- Having the related concert and movie entries in the db_concerts & db_movies tables. Because the metadata about the concerts and songs is not structured, these entries currently have to be created by hand.
- Knowing the id of the inserted "movie" in the db_movies table. The movie refers to a song video in our case. The id will be used to connect the analysis to the movie.
- Knowing the path of the song video
- Knowing the path to VIAN folder to add screenshots (not a must)

Library Requirements:
- Essentia: I have manually installed it following (https://essentia.upf.edu/installing.html). Tensorflow module of essentia is necessary!
- librosa: https://librosa.org/doc/latest/install.html
- visual-clutter: https://github.com/kargaranamir/visual-clutter

In [20]:
# Path of the song video that the cells below analyse.
video_path = "data/Got Me Under Pressure.mp4"
# Path of the local VIAN folder; only used by the screenshot cell at the end of the notebook.
vian_path = "/Users/uensal/Documents/melike/livemusicvis/VIAN-Web2"
# id of the related movie in the db_movies table; written as video_id by the insert cells.
# NOTE(review): `id` shadows the Python builtin of the same name; it is kept because later cells refer to it.
id = 560

Since the structure of the data archive is not yet settled, I have only left this example snippet, which iterates over a folder and its subdirectories.

In [1]:
rootdir = './'

# Example only: print the path of every file found below `rootdir`.
for current_dir, _subdirs, file_names in os.walk(rootdir):
    file_paths = (os.path.join(current_dir, file_name) for file_name in file_names)
    for file_path in file_paths:
        print(file_path)

./classes.txt
./.DS_Store
./similarity metric calculation.ipynb
./extract features and insert into db.ipynb
./audioset-yamnet-1.pb
./metadata analysis (to be run after features).ipynb
./EfficientNetB3-instruments-99.33.h5
./instruments.csv
./class_dict.csv
./.ipynb_checkpoints/extract features and insert into db-checkpoint.ipynb


In [5]:
# from https://github.com/chuckcho/camera-motion-detection/blob/master/cam_detect.py
import numpy as np
import math

try:
    import cv2
    # normalize some property names across opencv versions
    try:
        from cv2 import CAP_PROP_FPS
        from cv2 import CAP_PROP_FRAME_COUNT
    except ImportError:
        # fallback for OpenCV builds that expose the constants as cv2.cv.CV_CAP_PROP_*
        from cv2.cv import CV_CAP_PROP_FPS as CAP_PROP_FPS
        from cv2.cv import CV_CAP_PROP_FRAME_COUNT as CAP_PROP_FRAME_COUNT
except ImportError as import_error:
    # The message previously blamed a "non-CUDA server"; the actual problem
    # in this branch is that OpenCV (or its constants) could not be imported.
    print('[error] OpenCV (cv2) could not be imported: {}. The cells that read '
          'video frames will not work without it.'.format(import_error))

In [None]:
# from https://github.com/chuckcho/camera-motion-detection/blob/master/cam_detect.py
def find_dominant_mag_ang(flow):
    """
    Reduce a dense optical flow map to one dominant magnitude and one
    dominant angle.

    flow[..., 0] and flow[..., 1] are passed to cv2.cartToPolar as the x and y
    components. Returns (dom_mag, dom_ang); either value is NaN when the map
    has no dominant magnitude / angle. dom_ang is the median angle multiplied
    by 180 / pi.
    """
    magnitudes, angles = cv2.cartToPolar(flow[..., 0], flow[..., 1])

    # Thresholds: the minimum mean magnitude scales with the number of rows of
    # the map; the magnitude spread is limited relative to the mean.
    min_mean_magnitude = 0.05 * magnitudes.shape[0]/50
    max_relative_magnitude_std = 1.0
    max_cos_angle_std = 0.8

    # The magnitude is "dominant" when it is large enough and uniform enough.
    mean_magnitude = np.mean(magnitudes)
    magnitude_is_dominant = (
        mean_magnitude >= min_mean_magnitude
        and np.std(magnitudes) <= max_relative_magnitude_std * mean_magnitude
    )
    dom_mag = mean_magnitude if magnitude_is_dominant else float('nan')

    # The angle is "dominant" when the spread of cos(angle) is small; cos()
    # wraps the inherently circular angle (0 and 2*pi are the same direction).
    angle_is_dominant = np.std(np.cos(angles)) <= max_cos_angle_std
    dom_ang = np.median(angles) * 180 / np.pi if angle_is_dominant else float('nan')

    # Magnitude and angle are judged independently: one may be NaN while the
    # other is valid.
    return dom_mag, dom_ang

def _draw_dominant_history(values, count, window_size, scale):
    """
    Plot the most recent dominant-flow values as red dots on a black canvas
    of 180 x window_size pixels (debug visualisation only).
    """
    canvas = np.zeros((180, window_size, 3), np.uint8)
    for i in range(max(0, count - window_size), count):
        if not np.isfinite(values[i]):
            # frames without a dominant value are stored as NaN; int(NaN)
            # would raise, and there is nothing to draw for them anyway
            continue
        cv2.circle(
            canvas,
            (count - i, canvas.shape[0] - max(int(values[i]) * scale, 0)),
            1, (0, 0, 255), 1)
    return canvas


def detect_pan_tilt_zoom(videofile, OF_overlay_videofile=None):
    """
    Detect Pan/Tilt/Zoom camera motion separately

    The video is read frame by frame, dense optical flow is computed between
    consecutive (resized, grayscale) frames and reduced to a dominant
    magnitude/angle with find_dominant_mag_ang(). A frame is tagged as pan or
    tilt when a window of `min_consecutive_frames` frames has finite dominant
    values and a stable angle.

    Parameters:
        videofile: path of the video to analyse.
        OF_overlay_videofile: optional path; when given, a copy of the
            (resized) video with 'Pan' / 'Tilt' labels is written there.

    Returns a tuple
        (pan, tilt, zoom, frame_nums, timestamps,
         cummulative_dom_mag, cummulative_dom_ang, fps)
    where pan/tilt/zoom are boolean numpy arrays with one entry per analysed
    frame (zoom is never set by this implementation and stays all False),
    frame_nums are the frame numbers (1 is the second frame of the video,
    because the first frame only serves as reference for the first optical
    flow), timestamps are frame_num / fps, and the two cummulative arrays hold
    the per-frame dominant magnitude and angle (NaN where there is none).

    Raises:
        IOError: if not a single frame can be read from `videofile`.
    """

    # display images for debugging/troubleshooting
    visualize = False
    debug = False

    # process only every n-th frame
    sampling_rate = 1

    # image resize ratio
    resize_ratio = 0.5

    # will ignore short segments of frames in motion (likely to be noisy)
    min_consecutive_frames = 5

    frame_nums = []
    timestamps = []
    cummulative_dom_mag = []
    cummulative_dom_ang = []

    cap = cv2.VideoCapture(videofile)
    try:
        # get FPS; if unavailable, by default 30.0
        fps = cap.get(CAP_PROP_FPS)
        if fps <= 0.0 or math.isnan(fps):
            fps = 30.0

        # read first frame and resize
        ret, frame1 = cap.read()
        if not ret:
            raise IOError(
                "Could not read any frame from video file: {}".format(videofile))
        frame1 = cv2.resize(frame1, (0, 0), fx=resize_ratio, fy=resize_ratio)
        previous_frame = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
        # (width, height) of the resized frames, needed by the overlay writer
        frame_size = frame1.shape[1::-1]

        if visualize:
            hsv = np.zeros_like(frame1)
            hsv[..., 1] = 255
            plot_window_size = 500
            cv2.namedWindow('original', cv2.WINDOW_NORMAL)
            cv2.namedWindow('optical flow', cv2.WINDOW_NORMAL)
            cv2.namedWindow('dominant mag(OF)', cv2.WINDOW_NORMAL)
            cv2.namedWindow('dominant ang(OF)', cv2.WINDOW_NORMAL)

        frame_num = 1
        count = 1

        while True:
            # read subsequent frame
            ret, frame2 = cap.read()

            # check for end of video
            if not ret:
                if visualize:
                    cv2.waitKey(0)
                break

            # skip frames
            if frame_num % sampling_rate != 0:
                frame_num += 1
                continue

            # resize
            frame2 = cv2.resize(frame2, (0, 0), fx=resize_ratio, fy=resize_ratio)
            next_frame = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

            # get optical flow
            # refer to http://docs.opencv.org/modules/video/doc/motion_analysis_and_object_tracking.html#calcopticalflowfarneback
            # for details about each parameter
            flow = cv2.calcOpticalFlowFarneback(
                    prev=previous_frame,
                    next=next_frame,
                    flow=None,
                    pyr_scale=0.5,
                    levels=3,
                    winsize=15,
                    iterations=3,
                    poly_n=5,
                    poly_sigma=1.2,
                    flags=0
                    )

            # find majority angle and magnitude
            dom_mag, dom_ang = find_dominant_mag_ang(flow)

            cummulative_dom_mag.append(dom_mag)
            cummulative_dom_ang.append(dom_ang)
            frame_nums.append(frame_num)
            timestamps.append(frame_num / fps)

            # if enabled, will display (1) original image, (2) optical flow
            # image, and (3) history of dominant optical flow values
            if visualize:
                mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                hsv[..., 0] = ang*180/np.pi/2
                hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
                bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
                cv2.imshow('original', frame2)
                cv2.imshow('optical flow', bgr)
                cv2.imshow('dominant mag(OF)', _draw_dominant_history(
                    cummulative_dom_mag, count, plot_window_size, 10))
                cv2.imshow('dominant ang(OF)', _draw_dominant_history(
                    cummulative_dom_ang, count, plot_window_size, 1))

                k = cv2.waitKey(30) & 0xff
                if k == 27:
                    break

            # Advance exactly once per processed frame. These three statements
            # used to appear twice in the loop body, which made frame_nums run
            # 1, 3, 5, ... and doubled every timestamp.
            previous_frame = next_frame
            frame_num += 1
            count += 1
    finally:
        cap.release()

    # detect pan/tilt/zoom for each frame from dom_mag and dom_ang's (only
    # if they persists in some consecutive frames)

    # dealing with numpy array is easier than python list
    cummulative_dom_mag = np.array(cummulative_dom_mag)
    cummulative_dom_ang = np.array(cummulative_dom_ang)
    pan = np.zeros(len(frame_nums), dtype=bool)
    tilt = np.zeros(len(frame_nums), dtype=bool)
    zoom = np.zeros(len(frame_nums), dtype=bool)

    # number of full windows of min_consecutive_frames frames
    n_windows = max(0, len(frame_nums) - min_consecutive_frames + 1)
    for start in range(n_windows):
        window = slice(start, start + min_consecutive_frames)
        ang_window = cummulative_dom_ang[window]
        mag_window = cummulative_dom_mag[window]

        mean_dom_ang = np.nan
        if np.all(np.isfinite(ang_window)) and np.all(np.isfinite(mag_window)):
            # check if dominant angle was vertical (tilt) or horizontal (pan)
            mean_dom_ang = np.median(ang_window)
            std_dom_ang = np.std(ang_window)
            if std_dom_ang > 35:
                # angle is not stable within the window: don't tag these
                # frames with either tilt or pan (this used to fall through
                # into the tagging below because of `if` instead of `elif`)
                pass
            elif (45 + 20 <= mean_dom_ang <= 135 - 20) or (225 + 20 <= mean_dom_ang <= 315 - 20):
                tilt[window] = True
                pan[window] = False
            else:
                tilt[window] = False
                pan[window] = True

        if debug:
            print("[debug] f={}, t={}, dom_mag={}, dom_ang={}, mean_dom_ang={}, pan={}, tilt={}".format(
                    frame_nums[start],
                    timestamps[start],
                    cummulative_dom_mag[start],
                    cummulative_dom_ang[start],
                    mean_dom_ang,
                    pan[start],
                    tilt[start]))

    if OF_overlay_videofile:
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        # NOTE(review): the writer runs at twice the sampled frame rate, as in
        # the original code - confirm that this is intended.
        out = cv2.VideoWriter(OF_overlay_videofile, fourcc, fps/sampling_rate*2, frame_size)
        cap = cv2.VideoCapture(videofile)
        try:
            # skip the first frame so that `count` lines up with the analysis
            # pass, where the first frame was only the optical-flow reference
            ret, _ = cap.read()

            font = cv2.FONT_HERSHEY_SIMPLEX
            frame_num = 1
            count = 0

            while ret:
                # read subsequent frame
                ret, frame2 = cap.read()

                # check for end of video
                if not ret:
                    break

                # skip frames
                if frame_num % sampling_rate != 0:
                    frame_num += 1
                    continue

                # resize, label and write; the optical flow is not recomputed
                # here because its result was never used in this pass
                frame2 = cv2.resize(frame2, (0, 0), fx=resize_ratio, fy=resize_ratio)
                if tilt[count]:
                    cv2.putText(frame2,'Tilt',(10,100), font, 1,(0,0,255),2,cv2.LINE_AA)
                elif pan[count]:
                    cv2.putText(frame2,'Pan',(10,100), font, 1,(0,255,255),2,cv2.LINE_AA)
                out.write(frame2)

                frame_num += 1
                count += 1
        finally:
            cap.release()
            # the writer has to be released so that the output file is finalized
            out.release()

    return (pan,
            tilt,
            zoom,
            frame_nums,
            timestamps,
            cummulative_dom_mag,
            cummulative_dom_ang, fps)

def detection(video_path):
    """
    Run detect_pan_tilt_zoom() on `video_path` and turn its per-frame flags
    into a dictionary.

    Returns (camera_motion_perframe, fps): camera_motion_perframe maps a frame
    number to the list of motions ("pan", "tilt", "zoom") flagged for that
    frame (an empty list when none is flagged); fps is passed through from
    detect_pan_tilt_zoom(). The last two analysed frames are left out.
    """
    (pan, tilt, zoom, frame_nums,
     _timestamps, _dom_mag, _dom_ang, fps) = detect_pan_tilt_zoom(video_path, OF_overlay_videofile=None)

    # label order matters: it is the order of the labels inside each list
    flags_by_label = (("pan", pan), ("tilt", tilt), ("zoom", zoom))

    camera_motion_perframe = {}
    for position in range(len(frame_nums[:-2])):
        camera_motion_perframe[frame_nums[position]] = [
            label for label, flags in flags_by_label if flags[position]
        ]

    return camera_motion_perframe, fps


In [22]:
# Run the camera-motion detection on the configured video. detection() returns a dict
# mapping frame number -> list of motion labels, plus the frame rate it worked with.
camera_motion_perframe, fps = detection(video_path)

Classes for instrument detection -> see next lines of code

class_index,class,height,width,scale by
0,Didgeridoo,224,224,1
1,Tambourine,224,224,1
2,Xylophone,224,224,1
3,acordian,224,224,1
4,alphorn,224,224,1
5,bagpipes,224,224,1
6,banjo,224,224,1
7,bongo drum,224,224,1
8,casaba,224,224,1
9,castanets,224,224,1
10,clarinet,224,224,1
11,clavichord,224,224,1
12,concertina,224,224,1
13,drums,224,224,1
14,dulcimer,224,224,1
15,flute,224,224,1
16,guiro,224,224,1
17,guitar,224,224,1
18,harmonica,224,224,1
19,harp,224,224,1
20,marakas,224,224,1
21,ocarina,224,224,1
22,piano,224,224,1
23,saxaphone,224,224,1
24,sitar,224,224,1
25,steel drum,224,224,1
26,trombone,224,224,1
27,trumpet,224,224,1
28,tuba,224,224,1
29,violin,224,224,1

In [23]:
# Model from https://www.kaggle.com/code/gpiosenka/explore-instruments-data-set/data
model=keras.models.load_model("EfficientNetB3-instruments-99.33.h5") # # https://www.kaggle.com/code/gpiosenka/explore-instruments-data-set
# Class table with the columns class_index, class, height, width and "scale by" (listed above).
class_df=pd.read_csv("class_dict.csv") 
# Number of distinct class names in the table.
class_count=len(class_df['class'].unique())
# Input image size, taken from the first row of the class table.
img_height=int(class_df['height'].iloc[0])
img_width =int(class_df['width'].iloc[0])
# (width, height) order; the frame loop below passes this to cv2.resize.
img_size=(img_width, img_height)

Classes of interest:
class_index,class,height,width,scale by
0,Didgeridoo,224,224,1
4,alphorn,224,224,1
5,bagpipes,224,224,1
6,banjo,224,224,1
7,bongo drum,224,224,1
9,castanets,224,224,1
11,clavichord,224,224,1
14,dulcimer,224,224,1
16,guiro,224,224,1
21,ocarina,224,224,1
24,sitar,224,224,1

In [24]:
# class_index values of the "classes of interest" listed above; they are
# removed from both the class names and, later, the model predictions.
indexes = [0, 4, 5, 6, 7, 9, 11, 14, 16, 21, 24]
filtered_classes = [
    class_name
    for position, class_name in enumerate(class_df["class"])
    if position not in indexes
]

In [25]:
# Single pass over the video: classify the instrument on every frame and
# measure the visual clutter on every 10th frame.
cam = cv2.VideoCapture(video_path)
fps = cam.get(CAP_PROP_FPS)

currentframe = 0
clutter_scalars = list()
frames = list()          # frame indices at which the clutter was measured

musical_instruments = list()   # position in filtered_classes of each detection
inst_frames = list()           # frame index of each detection

# set lookup instead of scanning the `indexes` list for every class on every frame
excluded_indexes = set(indexes)

try:
    while True:
        ret, frame = cam.read()
        if not ret:
            break

        # cam.read() delivers BGR; convert once and reuse for both analyses
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        image = cv2.resize(frame_rgb, img_size)
        pred = model.predict(np.array([image]), verbose=0).flatten()

        # drop the excluded classes; positions now refer to filtered_classes
        filtered_predictions = [p for j, p in enumerate(pred) if j not in excluded_indexes]

        test_index = np.argmax(filtered_predictions)
        if filtered_predictions[test_index] >= 0.5:
            musical_instruments.append(test_index)
            inst_frames.append(currentframe)

        if currentframe % 10 == 1:
            clutter_image = cv2.resize(frame_rgb, (0, 0), fx=0.5, fy=0.5)
            clt = Vlc(clutter_image, numlevels=3, contrast_filt_sigma=1, contrast_pool_sigma=3, color_pool_sigma=3)
            clutter_scalar_fc, clutter_map_fc = clt.getClutter_FC()
            clutter_scalars.append(clutter_scalar_fc)
            frames.append(currentframe)

        currentframe += 1
finally:
    # release the capture even if prediction or clutter computation fails
    cam.release()
    cv2.destroyAllWindows()

In [26]:
# Distinct values found in musical_instruments; set() removes the duplicates and does not guarantee an order.
detected_instruments = list(set(musical_instruments))

In [27]:
# Re-label every detection with the position of its class in
# detected_instruments (0..k-1) and collect the matching class names.
#
# The mapping is applied in one step from the ORIGINAL values. Rewriting the
# array value by value in place can collide: if a class is first mapped to a
# new index that equals the not-yet-processed original value of another
# class, the next step rewrites both groups (the order of
# detected_instruments comes from a set and is not guaranteed to be
# ascending).
musical_instruments = np.array(musical_instruments)
instrument_names = [filtered_classes[instrument] for instrument in detected_instruments]

compact_index = {instrument: position for position, instrument in enumerate(detected_instruments)}
musical_instruments = np.array(
    [compact_index[instrument] for instrument in musical_instruments.tolist()],
    dtype=musical_instruments.dtype,
)

In [28]:
# Flatten the per-frame motion lists into parallel x/y lists:
# one (frame number, capitalised motion label) pair per flagged motion.
cam_points = [
    (frame_number, motion.title())
    for frame_number, motions in camera_motion_perframe.items()
    for motion in motions
]
camx = [frame_number for frame_number, _label in cam_points]
camy = [label for _frame_number, label in cam_points]

In [29]:
# Class name of every detection, in detection order.
new_arr = [instrument_names[instrument] for instrument in musical_instruments]


In [30]:
# Display labels for class names whose spelling in the class table differs
# from the label that should be shown ('steel drum' is shown as "Drums").
display_names = {
    "saxaphone": "Saxophone",
    'steel drum': "Drums",
    'acordian': "Accordion",
    'casaba': "Cabasa",
}

# Count every class once up front; calling new_arr.count() inside the loop
# rescans the whole list for every detection (quadratic in the number of
# detections).
occurrences = {}
for instrument in new_arr:
    occurrences[instrument] = occurrences.get(instrument, 0) + 1

# Keep only classes that account for more than 1% of all detections.
min_occurrences = len(new_arr)*0.01

instx = []
insty = []

for (n, instrument) in enumerate(new_arr):
    if occurrences[instrument] > min_occurrences:
        instx.append(inst_frames[n])
        insty.append(display_names.get(instrument, instrument).title())


In [31]:
# Decode the audio track of the video; sr is the sample rate returned by librosa.load.
y, sr = librosa.load(video_path)
# NOTE(review): the signal is resampled to 100 Hz *before* the harmonic/percussive
# separation, so hpss() only sees content below 50 Hz - confirm that this order is
# intended (presumably done to keep the stored arrays small).
sr_resampled = 100
y_resampled = librosa.resample(y, orig_sr=sr, target_sr=sr_resampled)
# Harmonic and percussive components of the 100 Hz signal.
y_harm, y_perc = librosa.effects.hpss(y_resampled)

  return f(*args, **kwargs)


In [32]:
# Onset strength envelope of the full-rate signal, and the time of each envelope frame.
o_env = librosa.onset.onset_strength(y=y, sr=sr)
times = librosa.times_like(o_env, sr=sr)

In [33]:
# Resample the waveform to 2000 Hz. This overwrites the 100 Hz sr_resampled / y_resampled
# from the earlier cell; these are the values the data dict stores under "sr" and "y".
sr_resampled = 2000
y_resampled = librosa.resample(y, orig_sr=sr, target_sr=sr_resampled)

In [34]:
# Everything that is stored for the "MusicAnalysis" row.
# NOTE(review): "sr" is the 2000 Hz rate of "y"; y_harm / y_perc were computed
# from the 100 Hz signal of the earlier cell - confirm that consumers know this.
data = {
    "clutter_scalars": clutter_scalars,
    "insty": insty,
    "instx": instx,
    "camx": camx,
    "camy": camy,
    # Exact frame indices at which the clutter was measured (collected in
    # `frames` by the frame loop). They used to be approximated by spreading
    # len(clutter_scalars) points evenly between 0 and the frame count.
    "clutter_frames": [float(frame_index) for frame_index in frames],
    "y_harm": list(y_harm.astype(float)),
    "y_perc": list(y_perc.astype(float)),
    "onset": list(o_env.astype(float)),
    "onset_times": list(times.astype(float)),
    "fps": fps,
    "y": list(y_resampled.astype(float)),
    "sr": sr_resampled,
}

dumped_json_string = json.dumps(data)
# The JSON text is stored as space-separated binary code points, one group per character.
binary_data = ' '.join(format(ord(letter), 'b') for letter in dumped_json_string)
    

In [None]:
import uuid

# Connection settings. The CONCERT_DB_* environment variables override the
# defaults, so credentials do not have to be edited into the notebook.
# NOTE(review): this cell defaults to port 5433 while the
# ClassificationAnalysis insert further below uses 5432 - confirm which one
# is correct.
conn = psycopg2.connect(
    dbname=os.environ.get("CONCERT_DB_NAME", "FilmColors_v2_Production"),
    user=os.environ.get("CONCERT_DB_USER", "ERCAdmin"),
    password=os.environ.get("CONCERT_DB_PASSWORD", "admin"),
    host=os.environ.get("CONCERT_DB_HOST", "localhost"),
    port=os.environ.get("CONCERT_DB_PORT", "5433"),
)
try:
    # the cursor is closed when the with-block ends, even on error
    with conn.cursor() as cur:
        cur.execute("INSERT INTO public.db_concert_analyses(video_id, classification_object, analysis_class_name, uuid, dtype, shape, data) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                    ,(id, 'Global', "MusicAnalysis",str(uuid.uuid1()), "dict", "",binary_data))
    conn.commit()
finally:
    # always give the connection back, also when the insert fails
    conn.close()



In [None]:
import sys
# Makes the manually installed Essentia build importable for this session only.
sys.path.append('/usr/local/lib/python3.9/site-packages/') # not permanent
import essentia
# The next cell uses these two algorithms; they were not imported anywhere in the notebook.
from essentia.standard import MonoLoader, TensorflowPredictVGGish

In [None]:
# Load the audio track at 16 kHz and run the YAMNet graph on it.
audio = MonoLoader(filename=video_path, sampleRate=16000)()
# Use a dedicated name: rebinding `model` here would clobber the Keras
# instrument classifier and break the frame loop cell when it is re-run.
yamnet_model = TensorflowPredictVGGish(graphFilename="audioset-yamnet-1.pb", input="melspectrogram", output="activations")
activations = yamnet_model(audio)
# Average the activations over the first axis to get one vector for the whole song.
averaged_predictions = np.mean(activations, axis=0)

In [130]:
# Raw bytes of the averaged activations, as stored in the database by the next cell.
# NOTE(review): the insert cell labels this payload as "np.float64", but tobytes() keeps whatever
# dtype averaged_predictions has (presumably float32 coming from Essentia) - confirm that the
# stored dtype label matches the data.
a = averaged_predictions.tobytes()

In [131]:
import uuid

# Connection settings. The CONCERT_DB_* environment variables override the
# defaults, so credentials do not have to be edited into the notebook.
# NOTE(review): this cell defaults to port 5432 while the MusicAnalysis
# insert further above uses 5433 - confirm which one is correct.
conn = psycopg2.connect(
    dbname=os.environ.get("CONCERT_DB_NAME", "FilmColors_v2_Production"),
    user=os.environ.get("CONCERT_DB_USER", "ERCAdmin"),
    password=os.environ.get("CONCERT_DB_PASSWORD", "admin"),
    host=os.environ.get("CONCERT_DB_HOST", "localhost"),
    port=os.environ.get("CONCERT_DB_PORT", "5432"),
)
try:
    # the cursor is closed when the with-block ends, even on error
    with conn.cursor() as cur:
        cur.execute("INSERT INTO public.db_concert_analyses(video_id, classification_object, analysis_class_name, uuid, dtype, shape, data) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                    ,(id, 'Global', "ClassificationAnalysis",str(uuid.uuid1()), "np.float64", "",a))
    conn.commit()
finally:
    # always give the connection back, also when the insert fails
    conn.close()



In [None]:
#screenshots
cam = cv2.VideoCapture(video_path)
fps = cam.get(CAP_PROP_FPS)

currentframe = 0

while(True):
    ret,frame = cam.read()
    if ret: 
        if currentframe%25==1:
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            cv2.imwrite('{}/VIAN-Web2/backend/Base/frames/{}/{}.jpg'.format(vian_path, id, currentframe), image) 
            
        currentframe += 1
    else:
        break
cam.release()
cv2.destroyAllWindows()