In [1]:
import os
import multiprocessing
import warnings
import numpy as np
from scipy import stats
import pandas as pd
import librosa
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load('en')
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from sklearn import ensemble
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from nltk import ngrams
import seaborn as sns
#import utils
import librosa
import librosa.display
import utils
import math

In [2]:
# Return the index of the dominant pitch class (the "main key") of a chromagram
def get_main_key(chroma):
    """Return the row index of ``chroma`` whose mean along axis 1 is largest.

    Rows are presumably the 12 pitch classes and columns the analysis
    frames -- TODO confirm against the caller. Ties resolve to the lowest
    row index (arg-max semantics).
    """
    row_means = np.mean(chroma, axis=1)
    return row_means.argmax()


def get_mode(chroma, main_key):
    """Return 1 if the row 4 semitones above ``main_key`` outweighs the row 3 above, else 0.

    The two candidate rows are taken modulo 12 and compared by their mean
    value; the caller treats 1 as "major" and 0 as "minor". A tie yields 0.
    """
    major_third = (main_key + 4) % 12
    minor_third = (main_key + 3) % 12

    if np.mean(chroma[major_third]) > np.mean(chroma[minor_third]):
        return 1
    return 0


def get_dissonance(mode, main_key, chroma):
    """Return the share of total chroma that lies on rows outside the scale.

    The scale is built from ``main_key`` by accumulating semitone steps:
    one step pattern when ``mode == 1`` and another for any other value.
    The result is sum(chroma on out-of-scale rows) / sum(all chroma).
    """
    major_steps = [0, 2, 2, 1, 2, 2, 2, 1]
    minor_steps = [0, 2, 1, 2, 2, 1, 2, 2]
    steps = major_steps if mode == 1 else minor_steps

    # Cumulative offsets from the tonic, wrapped into a single octave.
    in_scale = {(main_key + offset) % 12 for offset in np.cumsum(steps)}

    # Every row index 0..11 that is not a scale degree.
    out_of_scale = [pitch for pitch in range(12) if pitch not in in_scale]

    return np.sum(chroma[out_of_scale, :]) / np.sum(chroma)

def get_diffs(song_file, window_size):
    """Return normalised absolute changes between consecutive window means.

    ``song_file`` is cut into whole windows of ``window_size`` samples
    (any trailing remainder is dropped). Each window is summarised by the
    mean of its absolute values, and the absolute difference between
    neighbouring window means is divided by the largest window mean.
    """
    n_windows = song_file.shape[0] // window_size

    # Only the samples that fill complete windows are kept.
    usable = song_file[:n_windows * window_size]
    window_means = np.abs(usable.reshape(-1, window_size)).mean(axis=1)

    return np.abs(np.diff(window_means)) / window_means.max()

def get_key_change_frequency(chroma, hop_length, sr, main_key=None):
    """Return the fraction of 5-second windows whose dominant row differs from ``main_key``.

    Args:
        chroma: 2-D array whose axis 1 is split into consecutive windows.
        hop_length: number of audio samples between successive chroma columns.
        sr: sampling rate in Hz.
        main_key: reference row index. When omitted it is derived from the
            whole of ``chroma`` as the row with the largest mean, the same
            rule ``get_main_key`` applies.

    Returns:
        Number of windows whose dominant row is not ``main_key`` divided by
        the number of complete windows; 0.0 when ``chroma`` does not contain
        a single complete 5-second window.
    """
    # The original body read `main_key` without ever receiving it, which
    # raised NameError; it is now an optional argument with a sane default.
    if main_key is None:
        main_key = np.mean(chroma, axis=1).argmax()

    num_val_per_sec = sr / hop_length
    num_val_per_5_sec = int(5 * num_val_per_sec)

    # Only complete windows are counted; guard against a zero-width window.
    num_5_sec = chroma.shape[1] // num_val_per_5_sec if num_val_per_5_sec > 0 else 0
    if num_5_sec == 0:
        # Clip shorter than one window: no key change can be observed.
        return 0.0

    total_key_change = 0
    for i in range(num_5_sec):
        window = chroma[:, num_val_per_5_sec * i:num_val_per_5_sec * (i + 1)]
        # Dominant row of this window (same rule as get_main_key).
        if np.mean(window, axis=1).argmax() != main_key:
            total_key_change += 1

    return total_key_change / num_5_sec
    

def get_combined_hand_features(loader, sr, filename):
    """Build the 27-element hand-crafted feature vector for one audio file.

    Layout of the returned array:
        [0:12]   one-hot of the main key (index from ``get_main_key``)
        [12]     mode flag from ``get_mode`` (1 or 0)
        [13]     out-of-scale ratio from ``get_dissonance``
        [14]     largest normalised loudness diff
        [15]     number of diffs >= 0.5 divided by the number of windows
        [16]     key-change frequency over 5-second windows
        [17:27]  ten diffs sampled at evenly spaced positions

    Args:
        loader: object exposing ``load(filename)`` that returns the audio
            samples (presumably a 1-D array sampled at ``sr`` -- TODO confirm).
        sr: sampling rate in Hz.
        filename: path of the audio file to analyse.

    Returns:
        numpy array of 27 floats.
    """
    hop_length = 512

    song_file = loader.load(filename)
    # hop_length is passed explicitly so the chromagram's frame rate always
    # matches the value used for the 5-second window maths below.
    chroma = librosa.feature.chroma_stft(y=song_file, sr=sr, hop_length=hop_length)

    # 50 windows per second
    window_size = int(sr / 50)
    num_obs = math.floor(song_file.shape[0] / window_size)
    diffs = get_diffs(song_file, window_size)

    main_key = get_main_key(chroma)
    num_features = 12 + 5 + 10
    features = np.zeros(num_features)
    features[main_key] = 1

    mode = get_mode(chroma, main_key)
    features[12] = mode
    features[13] = get_dissonance(mode, main_key, chroma)

    # max abs diff normalized by the max mean
    features[14] = np.max(diffs)

    # share of drops/rises of at least 50% of the max mean
    # NOTE(review): diffs has num_obs - 1 entries but the count is divided
    # by num_obs; kept as-is so existing feature values do not change.
    features[15] = len(np.where(diffs >= 0.5)[0]) / num_obs

    # share of 5-second windows whose dominant key differs from main_key;
    # main_key must be handed over, the callee has no other way to know it.
    features[16] = get_key_change_frequency(chroma, hop_length, sr, main_key)

    # 10 diffs sampled at evenly spaced positions across the song
    num_samples = 10
    indices = np.linspace(0, len(diffs), num_samples, endpoint=False).astype(int)
    features[17:] = diffs[indices]

    return features


def compute_features_1(tid):
    """Compute the hand-crafted feature vector for the track with id ``tid``.

    The audio path is resolved from the ``AUDIO_DIR`` environment variable
    through ``utils.get_audio_path`` and the file is read with a
    ``utils.FfmpegLoader`` configured for 22050 Hz.
    """
    sampling_rate = 22050
    ffmpeg_loader = utils.FfmpegLoader(sampling_rate=sampling_rate)
    audio_dir = os.environ.get('AUDIO_DIR')
    audio_path = utils.get_audio_path(audio_dir, tid)
    return get_combined_hand_features(ffmpeg_loader, sampling_rate, audio_path)

In [3]:
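# Return the index of the dominant pitch class (the "main key") of a chromagram.
# NOTE: this cell redefines the helpers from the previous cell; when the cells
# are run in order, the definitions below are the ones in effect.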
def get_main_key(chroma):
    """Index of the chroma row with the highest average value.

    The average is taken along axis 1; the first maximal row wins on ties.
    """
    return np.argmax(np.mean(chroma, axis=1))


def get_mode(chroma, main_key):
    """Classify the mode from the relative strength of the two "third" rows.

    Compares the mean of the row 4 semitones above ``main_key`` with the
    mean of the row 3 semitones above it (both modulo 12). Returns 1 when
    the former is strictly larger, otherwise 0.
    """
    third_energy = {
        offset: np.mean(chroma[(main_key + offset) % 12]) for offset in (3, 4)
    }
    return int(third_energy[4] > third_energy[3])


def get_dissonance(mode, main_key, chroma):
    """Fraction of the chroma total that falls on rows outside the scale.

    ``mode == 1`` selects the first semitone step pattern, any other value
    the second. Steps are accumulated from ``main_key`` and wrapped modulo
    12; the remaining row indices are the out-of-scale rows.
    """
    intervals = [0, 2, 2, 1, 2, 2, 2, 1] if mode == 1 else [0, 2, 1, 2, 2, 1, 2, 2]

    # Row indices belonging to the scale rooted at main_key.
    scale_rows = (main_key + np.cumsum(intervals)) % 12

    # All of 0..11 that the scale does not touch, in ascending order.
    outside_rows = np.setdiff1d(np.arange(12), scale_rows)

    return np.sum(chroma[outside_rows, :]) / np.sum(chroma)

def get_diffs(song_file, window_size):
    """Window-to-window loudness changes, scaled by the loudest window.

    The signal is truncated to a whole number of ``window_size``-sample
    windows. Each window is reduced to the mean of its absolute values,
    and the absolute step between adjacent windows is divided by the
    maximum window mean.
    """
    full_windows, _ = divmod(song_file.shape[0], window_size)

    # Drop the trailing partial window, then lay the rest out one window per row.
    windows = song_file[:full_windows * window_size].reshape(-1, window_size)
    envelope = np.mean(np.abs(windows), axis=1)

    # Absolute first difference of the envelope, normalised by its peak.
    steps = np.abs(envelope[1:] - envelope[:-1])
    return steps / np.max(envelope)

def get_key_change_frequency(chroma, hop_length, sr, main_key=None):
    """Return the share of 5-second windows whose dominant row is not ``main_key``.

    Args:
        chroma: 2-D array whose axis 1 is cut into consecutive windows.
        hop_length: number of audio samples between successive chroma columns.
        sr: sampling rate in Hz.
        main_key: reference row index. Optional; when omitted it is taken
            as the row of ``chroma`` with the largest mean, the rule
            ``get_main_key`` uses.

    Returns:
        Count of windows with a different dominant row divided by the number
        of complete windows, or 0.0 when there is no complete window (the
        original divided by zero in that case).
    """
    if main_key is None:
        main_key = np.mean(chroma, axis=1).argmax()

    frames_per_sec = sr / hop_length
    window = int(5 * frames_per_sec)

    # Number of complete 5-second windows; a zero-width window means none.
    n_windows = chroma.shape[1] // window if window > 0 else 0
    if n_windows == 0:
        return 0.0

    changed = sum(
        1
        for start in range(0, n_windows * window, window)
        # Dominant row of the window, computed as get_main_key does.
        if np.mean(chroma[:, start:start + window], axis=1).argmax() != main_key
    )
    return changed / n_windows
    

def get_combined_hand_features(loader, sr, filename):
    """Build the 27-element hand-crafted feature vector for one audio file.

    Layout of the returned array:
        [0:12]   one-hot of the main key (index from ``get_main_key``)
        [12]     mode flag from ``get_mode`` (1 or 0)
        [13]     out-of-scale ratio from ``get_dissonance``
        [14]     largest normalised loudness diff
        [15]     number of diffs >= 0.5 divided by the number of windows
        [16]     key-change frequency over 5-second windows
        [17:27]  ten diffs sampled at evenly spaced positions

    Args:
        loader: object exposing ``load(filename)`` that returns the audio
            samples (presumably a 1-D array sampled at ``sr`` -- TODO confirm).
        sr: sampling rate in Hz.
        filename: path of the audio file to analyse.

    Returns:
        numpy array of 27 floats.
    """
    hop_length = 512

    song_file = loader.load(filename)
    # Forward hop_length instead of relying on the library default, so the
    # chroma frame rate and the key-change window size cannot drift apart.
    chroma = librosa.feature.chroma_stft(y=song_file, sr=sr, hop_length=hop_length)

    # 50 windows per second
    window_size = int(sr / 50)
    num_obs = math.floor(song_file.shape[0] / window_size)
    diffs = get_diffs(song_file, window_size)

    main_key = get_main_key(chroma)
    num_features = 12 + 5 + 10
    features = np.zeros(num_features)
    features[main_key] = 1

    mode = get_mode(chroma, main_key)
    features[12] = mode
    features[13] = get_dissonance(mode, main_key, chroma)

    # max abs diff normalized by the max mean
    features[14] = np.max(diffs)

    # share of drops/rises of at least 50% of the max mean
    # NOTE(review): diffs has num_obs - 1 entries but the count is divided
    # by num_obs; kept as-is so existing feature values do not change.
    features[15] = len(np.where(diffs >= 0.5)[0]) / num_obs

    # share of 5-second windows whose dominant key differs from main_key
    features[16] = get_key_change_frequency(chroma, hop_length, sr, main_key)

    # 10 diffs sampled at evenly spaced positions across the song
    num_samples = 10
    indices = np.linspace(0, len(diffs), num_samples, endpoint=False).astype(int)
    features[17:] = diffs[indices]

    return features

In [4]:
# Load the track metadata table through the project's utils.load helper.
tracks = utils.load('../data/tracks.csv')
# Index labels of the rows whose ('set', 'subset') value compares <= 'large'
# (presumably an ordered categorical, so this keeps 'large' and every
# smaller subset -- TODO confirm against utils.load).
subset = tracks.index[tracks['set', 'subset'] <= 'large']
# Restrict the table to those rows; `tracks` is rebound and reused by later cells.
tracks = tracks.loc[subset]

In [5]:
# Load the Echo Nest table and keep only the tracks present in both tables.
echonest = utils.load('../data/echonest.csv')
# Index.intersection is the explicit set operation. Using `&` between two
# Index objects for this purpose is deprecated in pandas and is treated as
# an element-wise logical operation in pandas 2.x.
join_index = echonest.index.intersection(tracks.index)
tracks = tracks.loc[join_index]
tracks.shape

(13129, 52)

In [6]:
# Compute one feature row per track: [track_id, feature_0, ..., feature_26].
# Tracks whose extraction fails are reported (id, then the error) and skipped.
global_list = []
for tid in tracks.index.values:
    try:
        lst = list(compute_features_1(tid))
    except Exception as e:
        print(tid)
        print(e)
    else:
        interim_list = [tid, *lst]
        global_list.append(interim_list)



11583
Command '['ffmpeg', '-i', '/mnt/hdd1/data_sets/fma_large/011/011583.mp3', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '22050', '-']' returned non-zero exit status 1




25173
Command '['ffmpeg', '-i', '/mnt/hdd1/data_sets/fma_large/025/025173.mp3', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '22050', '-']' returned non-zero exit status 1
25174
Command '['ffmpeg', '-i', '/mnt/hdd1/data_sets/fma_large/025/025174.mp3', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '22050', '-']' returned non-zero exit status 1
25175
Command '['ffmpeg', '-i', '/mnt/hdd1/data_sets/fma_large/025/025175.mp3', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '22050', '-']' returned non-zero exit status 1
25176
Command '['ffmpeg', '-i', '/mnt/hdd1/data_sets/fma_large/025/025176.mp3', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '22050', '-']' returned non-zero exit status 1
25180
Command '['ffmpeg', '-i', '/mnt/hdd1/data_sets/fma_large/025/025180.mp3', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '22050', '-']' returned non-zero exit status 1




In [7]:
# Assemble the collected rows into a table and report (rows, columns).
new_ft = pd.DataFrame(global_list)
print(*new_ft.shape)
new_ft.head()

13123 28


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.068744,0.028922,0.044352,0.002662,0.154069,0.058273,0.011409,0.14189,0.135761,0.012203
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.256889,0.090951,0.037718,0.000656,0.113935,0.079282,0.001438,0.000959,0.003341,0.028672
2,5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.104707,0.008335,0.071369,0.031389,0.068678,0.039291,0.096274,0.078089,0.109636,0.00211
3,10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.212948,0.006065,0.033545,0.029721,0.041147,0.030455,0.027272,0.015495,0.050853,0.007889
4,134,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.36494,0.010829,0.045546,0.004466,0.157112,0.077691,0.025667,0.060763,0.014685,0.074567


In [8]:
# Column names for the feature table, in the order the rows were built:
# the track id, the 12 one-hot key columns, 5 scalar features, 10 sampled diffs.
# NOTE(review): the 'key' column receives features[12], which
# get_combined_hand_features fills with the mode flag from get_mode; the
# name is kept unchanged so the written CSV schema stays the same.
pitch_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
scalar_names = ['key', 'dissonance', 'max_diff', 'drop_rise', 'key_change_freq']
diff_names = ['Diff{}'.format(i) for i in range(1, 11)]
cols = ['track_id'] + pitch_names + scalar_names + diff_names

# Rebuild the table with named columns and persist it without the row index.
new_ft = pd.DataFrame(global_list, columns=cols)
new_ft.to_csv('../data/new_feat_large.csv', index=False)

In [None]:
# Preview the first 10 rows of the named feature table.
new_ft.head(10)