## Env set up:

#### conda create --name audio-sentiment python=3.8
#### conda activate audio-sentiment 
#### conda install -c conda-forge librosa==0.9.2
#### jupyter notebook

## Methodology approach:

#### 1. isolates the audio signal's top peaks with the highest duration by estimating the signal's short-term energy
#### 2. controls the distance and adjusts the order between consecutive 'highest-duration' peaks
#### 3. in case the distance is lower than the specified minimum, two consecutive peaks are joined into a single one

## Main Params:

#### - max_num_peaks - controls number of clips to be returned
#### - min_peak_len - minimum peak duration in seconds
#### - min_peak_dist - minimum distance between peaks in seconds
#### - clip_len - duration of clips to be returned in seconds

In [1]:
import sys
import glob
import librosa
import datetime
import warnings
import numpy as np
import pandas as pd


# Silence every warning for the whole session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries imported above are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')
                        
                        
class AudioSentiment:
    """Find the longest runs of consecutive "onset seconds" in audio recordings.

    Audio is loaded with librosa, onsets are detected in seconds, the onset
    times are floored to whole seconds, and maximal runs of consecutive
    seconds ("peaks") are extracted longest-first.  Peaks shorter than the
    configured minimum are dropped and peaks that sit closer together than
    the configured minimum distance are joined into one.
    """

    def __init__(self, bucket, recording, extension, max_num_peaks, min_peak_len, min_peak_dist, clip_len):
        """Store the analysis configuration.

        Args:
            bucket: Path prefix of the recordings folder (stored as ``str``).
            recording: Recording identifier appended to ``bucket`` to form
                the prefix of every audio file path (stored as ``str``).
            extension: File extension, including the dot, used to glob files.
            max_num_peaks: Maximum number of peaks to extract (cast to int).
            min_peak_len: Minimum peak duration in seconds (cast to int).
            min_peak_dist: Minimum distance between peaks in seconds
                (cast to int).
            clip_len: Clip duration in seconds, added to each peak start to
                produce ``end_in_minutes``.
        """
        self.bucket = str(bucket)
        self.recording_id = str(recording)
        self.extension = str(extension)
        self.max_num_of_reactions = int(max_num_peaks)
        self.min_reaction_length = int(min_peak_len)
        self.min_reaction_distance = int(min_peak_dist)
        self.clip_length = clip_len
        # Set by generate_timelines() to the earliest file timestamp.
        self.start_timestamp = None
        # Files whose numeric action id is >= this value are treated as
        # speakers, the rest as reactions.
        self.uuid_min = 1000

    def generate_timelines(self):
        """Build speaker and reaction timelines from the recording's files.

        Files are discovered with ``glob`` using
        ``bucket + recording_id + '*' + extension``.  Paths containing
        ``"Composite"`` are returned separately; every other file name is
        expected to look like ``<action_id>_<timestamp><extension>``.

        Returns:
            tuple: ``(speakers, reactions, composite_path)`` where the first
            two are DataFrames with ``timestamp``, ``action_id`` and
            ``file_path`` columns sorted by timestamp, and the last is the
            list of composite file paths.  Also sets ``start_timestamp``.
        """
        direct_path = self.bucket + self.recording_id
        paths = glob.glob(direct_path + '*' + self.extension)

        composite_path = [item for item in paths if "Composite" in item]
        file_names = [item[len(direct_path):] for item in paths if "Composite" not in item]

        actions = [name.split('_')[0] for name in file_names]
        timestamps = [name.split('_')[1][:-len(self.extension)] for name in file_names]
        timestamps = self.timestamps_conversion(timestamps=timestamps)

        timeline = pd.DataFrame(
            data={"timestamp": timestamps,
                  "action_id": actions,
                  "file_path": file_names}).sort_values(by=["timestamp"]).reset_index(drop=True)

        self.start_timestamp = timeline["timestamp"].min()

        # The two groups must be disjoint: an id equal to uuid_min is a
        # speaker only (previously it was placed in both frames).
        action_ids = timeline["action_id"].astype(int)
        speakers = timeline[action_ids >= self.uuid_min].reset_index(drop=True)
        reactions = timeline[action_ids < self.uuid_min].reset_index(drop=True)

        return speakers, reactions, composite_path

    def batch_audios_peaks(self, timeline_df):
        """Compute duration, loudness and peak starts for every timeline row.

        Args:
            timeline_df: DataFrame with a ``file_path`` column holding file
                names relative to ``bucket + recording_id``.

        Returns:
            A copy of ``timeline_df`` with added columns ``length``,
            ``cum_kdb``, ``max_db`` and ``peak_<k>_start`` for
            ``k = 1..max_num_peaks``.  Peak columns for which no peak was
            found stay NaN.
        """
        direct_path = self.bucket + self.recording_id
        peak_columns = [f"peak_{k + 1}_start" for k in range(self.max_num_of_reactions)]

        timeline = timeline_df.copy()
        timeline[['length', 'cum_kdb', 'max_db']] = np.nan
        timeline[peak_columns] = np.nan

        for row in timeline.index:
            audio_path = direct_path + str(timeline.at[row, 'file_path'])
            audio_file, sr = librosa.load(path=audio_path)

            # .at writes straight into the frame; chained indexing
            # (frame[col][row] = ...) is not guaranteed to do so.
            timeline.at[row, 'length'] = librosa.get_duration(y=audio_file, sr=sr)

            spectrum = np.abs(librosa.stft(audio_file))
            decibels = librosa.power_to_db(spectrum ** 2, ref=np.median)
            # Values are kept to 8 decimal places, as before.
            timeline.at[row, 'cum_kdb'] = float("{:.8f}".format(float(decibels.sum() / 1000)))
            timeline.at[row, 'max_db'] = float("{:.8f}".format(float(decibels.max())))

            peaks = self.calculate_peaks(audio_file, sr=sr)
            isolated = self.recursive_peaks_isolation(peaks)
            # zip() stops at the shorter input, so files with fewer peaks
            # than columns simply leave the remaining columns as NaN.
            for column, peak in zip(peak_columns, isolated):
                timeline.at[row, column] = peak[0]

        return timeline

    def single_audio_peaks(self, file_name):
        """Return the proposed clip cuts for a single audio file.

        Args:
            file_name: File name relative to ``bucket + recording_id``.

        Returns:
            DataFrame with one row per peak (at most ``max_num_peaks``; fewer
            or none when the file has fewer peaks) and the columns
            ``peak_start``, ``peak_length``, ``peak_end`` (start + length),
            ``start_in_minutes`` (``H:MM:SS``) and ``end_in_minutes``
            (start + clip length, ``HH:MM:SS``).
        """
        audio_path = self.bucket + self.recording_id + file_name
        audio_file, sr = librosa.load(path=audio_path)
        peaks = self.calculate_peaks(audio_file, sr)
        isolated = self.recursive_peaks_isolation(peaks)[:self.max_num_of_reactions]

        result = pd.DataFrame(data={
            "peak_start": [peak[0] for peak in isolated],
            "peak_length": [len(peak) for peak in isolated],
        })
        result["peak_end"] = result["peak_start"] + result["peak_length"]
        result["start_in_minutes"] = [
            str(datetime.timedelta(seconds=int(start))) for start in result["peak_start"]
        ]
        result["end_in_minutes"] = [
            self._format_clock(int(start) + self.clip_length) for start in result["peak_start"]
        ]

        return result

    def recursive_peaks_isolation(self, sequence):
        """Extract, filter and merge the longest runs of consecutive seconds.

        Args:
            sequence: Onset times in seconds (any numeric iterable).

        Returns:
            list[list[int]]: Peaks in extraction order (longest run first).
            Each peak is a sorted list of whole seconds.  Runs shorter than
            ``min_reaction_length`` are dropped.  A peak whose gap to the
            previously kept peak is below ``min_reaction_distance`` is joined
            into that peak instead of being reported separately.  Returns an
            empty list when there are no onsets.
        """
        remaining = self.unique_seconds(sequence)

        runs = []
        for _ in range(self.max_num_of_reactions):
            # Stop when nothing is left.  The second condition reproduces the
            # pre-existing stop rule (fewer seconds left than runs found).
            # NOTE(review): confirm that rule is intended.
            if not remaining or len(remaining) < len(runs):
                break
            run = self.longest_increasing_subsequence(remaining)
            runs.append(run)
            taken = set(run)
            remaining = [second for second in remaining if second not in taken]

        runs = [run for run in runs if len(run) >= self.min_reaction_length]

        merged = []
        for run in runs:
            if merged and self._gap_between(merged[-1], run) < self.min_reaction_distance:
                # Join into the previous peak; sorting keeps element 0 the
                # earliest second regardless of which run came first in time.
                merged[-1] = sorted(merged[-1] + run)
            else:
                merged.append(run)

        return merged

    def longest_increasing_subsequence(self, sequence):
        """Return the longest run of consecutive whole seconds in ``sequence``.

        Despite the name this is not a general longest-increasing-subsequence
        search: values are floored and de-duplicated, then the longest
        stretch ``[s, s + 1, ..., s + k]`` is returned.  On ties the earliest
        run wins.  An empty input yields an empty list.
        """
        seconds = self.unique_seconds(sequence)
        if not seconds:
            return []

        best_start, best_len = seconds[0], 1
        run_start, run_len = seconds[0], 1
        for previous, current in zip(seconds, seconds[1:]):
            if current == previous + 1:
                run_len += 1
            else:
                run_start, run_len = current, 1
            # Strict comparison so the first of several equally long runs wins.
            if run_len > best_len:
                best_start, best_len = run_start, run_len

        return list(range(best_start, best_start + best_len))

    @staticmethod
    def timestamps_conversion(timestamps):
        """Convert compact ``YYYYMMDDhhmmss<fraction>`` strings to timestamps.

        The list is converted in place (each element is replaced by the
        result of ``pd.to_datetime``) and the same list is returned.
        """
        for position, raw in enumerate(timestamps):
            readable = (f"{raw[:4]}-{raw[4:6]}-{raw[6:8]} "
                        f"{raw[8:10]}:{raw[10:12]}:{raw[12:14]}.{raw[14:]}")
            timestamps[position] = pd.to_datetime(readable)

        return timestamps

    @staticmethod
    def unique_seconds(seconds_sequence):
        """Floor values to whole seconds and return them sorted and unique."""
        whole_seconds = np.unique(np.floor(seconds_sequence))
        return [int(value) for value in whole_seconds]

    @staticmethod
    def calculate_peaks(audio_file, sr):
        """Return onset times, in seconds, detected by librosa."""
        # The signal is passed by keyword instead of by position.
        return librosa.onset.onset_detect(y=audio_file, sr=sr, units='time')

    @staticmethod
    def _gap_between(first, second):
        """Return the distance in seconds between two sorted peaks."""
        if first[-1] < second[0]:
            return second[0] - first[-1]
        return first[0] - second[-1]

    @staticmethod
    def _format_clock(total_seconds):
        """Format a number of seconds as zero-padded ``HH:MM:SS``."""
        hours, remainder = divmod(int(total_seconds), 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


In [5]:
# Inputs: where the recordings live and how peaks are selected
bucket_name = '/path/to/recordings/folder'
recording_id = '/recording_id/'
ext = '.aac'

# Durations below are expressed in seconds
clip_len = 45
min_peak_lenght = 5
min_peak_distance = 10

# Upper bound on the number of peaks reported per file
max_num_of_peaks = 10

In [6]:
# Build the analyser from the inputs defined in the previous cell
audio_analysis = AudioSentiment(
    bucket=bucket_name,
    recording=recording_id,
    extension=ext,
    max_num_peaks=max_num_of_peaks,
    min_peak_len=min_peak_lenght,
    min_peak_dist=min_peak_distance,
    clip_len=clip_len,
)

In [8]:
# Proposed clip cuts for one file of the configured bucket and recording,
# based on that file's top peaks.
# NOTE(review): this comment used to name "CompositeRecording.acc", but the
# file analysed here is "GstVideo.mp4" — confirm which one is intended.
composite_peaks = audio_analysis.single_audio_peaks(file_name="GstVideo.mp4")
composite_peaks

Unnamed: 0,peak_start,peak_length,peak_end,start_in_minutes,end_in_minutes
0,213,176,389,0:03:33,00:04:18
1,1032,174,1206,0:17:12,00:17:57
2,783,154,937,0:13:03,00:13:48
3,1363,139,1502,0:22:43,00:23:28
4,658,124,782,0:10:58,00:11:43
5,1261,100,1361,0:21:01,00:21:46
6,561,90,651,0:09:21,00:10:06
7,1503,89,1592,0:25:03,00:25:48
8,399,82,481,0:06:39,00:07:24
9,133,79,212,0:02:13,00:02:58


In [6]:
# Build the speaker-level and effect-level timelines
timelines = audio_analysis.generate_timelines()

# First element: speakers, second element: effects
speakers_timeline, effects_timeline = timelines[0], timelines[1]

# Per-file duration, loudness and peak starts for each timeline
sentiment_speakers, sentiment_effects = (
    audio_analysis.batch_audios_peaks(frame)
    for frame in (speakers_timeline, effects_timeline)
)

In [7]:
# Show the speaker-level timeline as the cell output
speakers_timeline

Unnamed: 0,timestamp,action_id,file_path
0,2022-08-25 13:23:43.896,19297,19297_20220825132343896.aac
1,2022-08-25 13:27:22.677,3368,3368_20220825132722677.aac
2,2022-08-25 13:29:15.666,3421,3421_20220825132915666.aac
3,2022-08-25 13:32:43.566,19297,19297_20220825133243566.aac
4,2022-08-25 13:53:55.426,3500,3500_20220825135355426.aac
5,2022-08-25 13:54:13.656,4175,4175_20220825135413656.aac
6,2022-08-25 13:54:50.147,3500,3500_20220825135450147.aac
7,2022-08-25 14:02:53.196,4175,4175_20220825140253196.aac
8,2022-08-25 14:04:39.746,4175,4175_20220825140439746.aac
9,2022-08-25 14:05:36.376,4175,4175_20220825140536376.aac


In [8]:
# Show the per-file statistics computed for the speaker timeline
sentiment_speakers

Unnamed: 0,timestamp,action_id,file_path,length,cum_kdb,max_db,peak_1_start,peak_2_start,peak_3_start,peak_4_start,peak_5_start,peak_6_start,peak_7_start,peak_8_start,peak_9_start,peak_10_start
0,2022-08-25 13:23:43.896,19297,19297_20220825132343896.aac,16.0,26477.262,115.01680756,,,,,,,,,,
1,2022-08-25 13:27:22.677,3368,3368_20220825132722677.aac,54.314694,25383.736,79.77148438,0.0,,,,,,,,,
2,2022-08-25 13:29:15.666,3421,3421_20220825132915666.aac,20.522676,55926.712,137.09065247,0.0,,,,,,,,,
3,2022-08-25 13:32:43.566,19297,19297_20220825133243566.aac,2468.43737,467685.344,72.0160675,445.0,1591.0,219.0,2266.0,801.0,1832.0,1380.0,672.0,2136.0,1249.0
4,2022-08-25 13:53:55.426,3500,3500_20220825135355426.aac,16.938685,2340.4915,64.46118927,,,,,,,,,,
5,2022-08-25 13:54:13.656,4175,4175_20220825135413656.aac,18.581361,4747.404,72.07978821,,,,,,,,,,
6,2022-08-25 13:54:50.147,3500,3500_20220825135450147.aac,69.71737,116022.296,112.27416229,34.0,12.0,,,,,,,,
7,2022-08-25 14:02:53.196,4175,4175_20220825140253196.aac,40.832018,57375.3,106.73985291,2.0,,,,,,,,,
8,2022-08-25 14:04:39.746,4175,4175_20220825140439746.aac,17.024036,17109.624,101.45874786,,,,,,,,,,
9,2022-08-25 14:05:36.376,4175,4175_20220825140536376.aac,22.229342,9348.228,69.19724274,,,,,,,,,,


In [9]:
# Show the effect-level timeline as the cell output
effects_timeline

Unnamed: 0,timestamp,action_id,file_path
0,2022-08-25 13:21:33.966,22,22_20220825132133966.aac
1,2022-08-25 13:21:34.296,2,2_20220825132134296.aac


In [10]:
# Show the per-file statistics computed for the effects timeline
sentiment_effects

Unnamed: 0,timestamp,action_id,file_path,length,cum_kdb,max_db,peak_1_start,peak_2_start,peak_3_start,peak_4_start,peak_5_start,peak_6_start,peak_7_start,peak_8_start,peak_9_start,peak_10_start
0,2022-08-25 13:21:33.966,22,22_20220825132133966.aac,3138.026667,7949531.136,135.55253601,1645.0,1067.0,712.0,1624.0,891.0,3118.0,1398.0,2020.0,1241.0,362.0
1,2022-08-25 13:21:34.296,2,2_20220825132134296.aac,181.738685,153895.504,93.62083435,26.0,109.0,1.0,138.0,161.0,,,,,
