In [None]:
# Mount Google Drive Folder
# Colab-specific: attaches the Drive at the /content/drive mount point so the
# dataset path used later in this notebook resolves.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
# audiosegment is the only third-party package installed here; it is imported
# in the next cell.
# NOTE(review): the version is not pinned -- consider pinning it so reruns are reproducible.
!pip3 install audiosegment

In [None]:
# Define imports
import os
import math
import audiosegment
import numpy as np
import pandas as pd

In [None]:
# Declare constants
# Target sampling rate in Hz; every clip is resampled to this rate before analysis.
sample_rate = 22_050
# Folder on the mounted Drive that is scanned for .wav files (trailing slash kept).
data_directory = "/content/drive/My Drive/Trump/"
# Accumulator: one [min, max, mean, median] row is appended per processed file.
db_values = list()

In [None]:
# Calculate sound metrics for each wav file in the dataset
def _to_unit_float(samples):
    """Scale raw PCM samples to float32.

    int16 and int32 data are divided by their full-scale value and uint8 data
    is re-centred around zero first. Any other dtype is only cast to float32.
    """
    if samples.dtype == np.int16:
        samples = samples / 32768.0
    elif samples.dtype == np.int32:
        samples = samples / 2147483648.0
    elif samples.dtype == np.uint8:
        # Cast before subtracting: in uint8 arithmetic, values below 128 would
        # wrap around instead of becoming negative.
        samples = (samples.astype(np.float32) - 128.0) / 128.0
    return samples.astype(np.float32)


def _db_statistics(samples):
    """Return [min, max, mean, median] of the per-sample level in dB.

    The level is 20 * log10(|sample|) after scaling with _to_unit_float.
    Samples whose level is not finite (exact zeros give -inf) are excluded.

    Raises:
        ValueError: if no sample has a finite level (e.g. pure silence or an
            empty clip), because the statistics are undefined in that case.
    """
    # Use magnitudes: log10 of a negative sample is NaN, which would silently
    # discard every negative sample of the waveform.
    magnitudes = np.abs(_to_unit_float(samples))
    with np.errstate(divide="ignore"):  # log10(0) -> -inf is expected and filtered below
        levels = 20 * np.log10(magnitudes)
    # Aggregate in float64 so the mean over long clips does not lose precision.
    levels = levels[np.isfinite(levels)].astype(np.float64)
    if levels.size == 0:
        raise ValueError("no non-zero, finite samples to measure")
    return [np.min(levels), np.max(levels), np.mean(levels), np.median(levels)]


# Start from an empty list so re-running this cell never appends duplicate rows.
db_values = []

for file in sorted(os.listdir(data_directory)):
    filename = os.path.join(data_directory, os.fsdecode(file))
    print("Current Filename: %s" % filename)

    if not filename.endswith(".wav"):
        continue

    try:
        audio = audiosegment.from_file(filename).resample(sample_rate_Hz=sample_rate)
        wav = audio.to_numpy_array()

        # Multi-channel audio arrives as a 2-D array; pool all channels into one vector.
        if wav.ndim == 2:
            wav = wav.T.flatten()

        db_min, db_max, db_mean, db_median = _db_statistics(wav)
    except Exception as error:
        # Best effort: report the failing file and its cause, then move on to the
        # next one. KeyboardInterrupt is deliberately not caught so the cell can
        # still be stopped.
        print("Failed to extract sound data/statistics from %s (%s: %s)"
              % (filename, type(error).__name__, error))
        continue

    # Names avoid shadowing the min/max builtins for the rest of the session.
    db_values.append([db_min, db_max, db_mean, db_median])
    print('Min: %f; Max: %f; Mean: %f; Median: %f' % (db_min, db_max, db_mean, db_median))

In [None]:
# Save sound metrics to storage
# Force a 2-D (n_files, 4) float array: with no processed files a plain
# np.array([]) would be 1-D and the column slicing done later would fail.
np_db_values = np.array(db_values, dtype=np.float64).reshape(-1, 4)
np.save("np_db_values.npy", np_db_values)

In [None]:
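# Load sound metrics from storage (optional: uncomment the line below to reuse
# previously saved results instead of recomputing them)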
# np_db_values = np.load("np_db_values.npy")

In [None]:
# Create a DataFrame
# Column k of np_db_values becomes the k-th named metric; each column is
# converted to a plain Python list before being handed to pandas.
d = {
    label: np_db_values[:, position].tolist()
    for position, label in enumerate(('Min', 'Max', 'Mean', 'Median'))
}
# Dict insertion order fixes the column order: Min, Max, Mean, Median.
df = pd.DataFrame(d, columns=list(d))

In [None]:
# Inspect new DataFrame
# Columns are Min, Max, Mean and Median, built from np_db_values.
print(df)

In [None]:
# View aggregated sound metrics
# Min and Max are the extremes across all rows. "Overall Mean" is the
# unweighted mean of the per-file means and "Overall Median" is the median of
# the per-file medians (previously both were computed with .max()).
print('Overall Min: %f; Overall Max: %f; Overall Mean: %f; Overall Median: %f' % (
    df["Min"].min(),
    df["Max"].max(),
    df["Mean"].mean(),
    df["Median"].median(),
))