In [1]:
#Mapping pitch to frequency
#midi notes are as follows:
#A0 = 21, A1 = 33
#C8 = 108, A4 = 69
#Piano MIDI notes run from 21 to 108; the same note one octave up is 12 higher




#12 Notes in an octave
#Starts at C
#C, C#, D, D#, E, F, F#, G, G#, A, A#, B

def pitch2freq(p, a4=440.0):
    """Convert a MIDI note number to its frequency in Hz (equal temperament).

    Args:
        p: MIDI note number. Fractional values are allowed, e.g. 69.5 is
            50 cents above A4.
        a4: Frequency in Hz assigned to A4 (midi number 69). Defaults to the
            standard concert pitch of 440 Hz.

    Returns:
        The frequency in Hz.
    """
    #Every semitone multiplies the frequency by 2**(1/12), so 12 semitones (an octave) doubles it
    return 2 ** ((p - 69) / 12) * a4

#Frequency of middle C, C4 or midi number 60, is ~261.63 Hz

print(pitch2freq(60))

#Octaves are also divided into cents, where the interval between adjacent semitones is divided into 100 cents

#1200 cents in an octave, a noticeable difference in pitch is about 10-25 cents

#Humans perceive sounds at very low intensities, the threshold of hearing is about 10^-12 W/m^2
#NOTE: 10^-12 is written 1e-12 in Python; 10e-12 would mean 10 * 10^-12 = 1e-11
toh = 1e-12
#Threshold of pain is 10 W/m^2

#Since the human perception of sound covers 13 orders of magnitude, we use a log scale for intensity (decibels)
from math import log10
def intensity2dB(I):
    """Convert a sound intensity to decibels relative to the threshold of hearing.

    Args:
        I: Sound intensity in W/m^2. Must be positive.

    Returns:
        The intensity level in dB: 0 dB at the threshold of hearing (toh),
        130 dB at the 10 W/m^2 threshold of pain.

    Raises:
        ValueError: If I is zero or negative, since log10 is undefined there.
    """
    return 10 * log10(I/toh)

#If I is = toh, then we return 0 decibels
#Every time we go up by ~3 dB, intensity doubles

#If frequency is the objective measure of pitch (subjective)
#Then intensity is the objective measure of loudness (also subjective)

#Humans perceive lower frequency sounds as quieter than higher frequency sounds, even if they have the same intensity.






261.6255653005986


In [2]:
#TIMBRE

#Timbre is the difference between two sounds that have the same intensity, frequency, and duration

#For instance middle C played on a trumpet vs a piano

#Timbre is multidimensional, meaning it has multiple features that all contribute to it
#Sound envelope, harmonic content, and amplitude/frequency modulation

#Sound envelope refers to the ADSR of the note

#Attack (The amount of time it takes for the note to start and reach its highest amplitude)
#Decay (The amount of time it takes for the note to fall from its highest amplitude to the sustain level)
#Sustain (The amount of time the note remains at the same volume)
#Release (The amount of time it takes for the note to fade)



#The harmonic content refers to how intensity is split up among the different partial frequencies of a note, this gives the note its color
#The fundamental freq of the note has the highest intensity, with multiples of it layered over it
#The distribution of intensity over those multiples is the harmonic content


#Pitch and amp modulation

#Notes can also change in frequency and intensity over time
#This too contributes to the timbre of the sound

#Notes that change in pitch have vibrato
#Notes that change in amplitude have tremolo



1e-11


In [3]:
#ADC and DAC

#ADC stands for analog to digital conversion

#Since sound is composed of mechanical waves, the amplitude and time scales are continuous

#You would need an infinite amount of memory to digitally store a sound

#Have to convert to discrete values

#We use sampling and quantization to do that

#Sampling:
    #Pretty self explanatory, the sound is sampled at various points on the time scale
    #Sampling rate refers to how often the amplitude is recorded on the timescale
    #The Nyquist frequency is half of the sampling rate
    #If there are frequencies above the Nyquist frequency, then they will be aliased (moved to a lower frequency) when sampled
    #For that reason, most sampling rates are a little above 40000 Hz (e.g. 44100 Hz for CDs): since the human hearing range caps out at about 20 kHz, all audible frequencies are below the Nyquist frequency and therefore not aliased
    

#Quantization
    #Pretty similar to sampling, except for the amplitude scale instead of the time scale
    #Amplitude is 'sampled' onto an axis where each 'tick' or space is a binary value
    #Resolution = number of bits
    #CD resolution is 16 bits, meaning amplitude is tracked over (2^16) 65536 possible values.
    

In [4]:
#Audio feature categorization

#Level of abstraction
    #Low level features: amplitude envelope, energy, spectral centroid, spectral flux, zero-crossing rate
    #Mid level features: pitch and beat related descriptors, fluctuation patterns, MFCCs, note onsets (when they are struck)
    #High level features: instrumentation, key, chords, melody, rhythm, tempo, lyrics, genre, mood
#Temporal scope
    #Instantaneous (~50ms)
    #Segment-level (seconds)
    #Global (entire song)
#Signal domain
    #Some features are in the time domain only: amp env, zero-crossing rate, and RMSE. They are extracted from the raw waveform
    #Others live in the frequency domain: Band energy ratio, spectral centroid, spectral flux (usually use Fourier Transform to analyse these)
    #The final features are time-freq domain: spectrogram, MFCCs, and constant-q transform
#ML approach
    #Traditional ML techniques: SVMs, logistic and linear regression
        #With these techniques you hand pick certain audio features you feel would be important to solving some sort of audio classification task, and feed them into your SVM or regression
    #Deep learning techniques:
        #With DL techniques, you usually send in the whole raw audio or a spectrogram. The features are then extracted from there automatically and fed into the algorithm
        
#Audio feature pipeline
    #Time domain features:
        #Natural audio is fed through an ADC, sampled and quantized, each sample lasts (1/44100) s =~ 0.0227 ms at a 44100 Hz sampling rate
        #Samples are 'framed' or put into perceivable chunks of audio
        #Ear resolution is 10ms
        #Frames are always of size 2^x in sample length, because frame lengths that are powers of 2 speed up the fast Fourier transform
        #Typical values 256-8192 samples
        #Time domain features are then calculated on the frames
        #And then aggregated together (mean, median, gaussian mixture models)
        #Then we have a final feature value, vector, or matrix
    #Frequency domain features:
        #Natural audio is fed through an ADC, and framed again
        #The frames are then moved from the time domain to the frequency domain using the fourier transform
        #BUT! We have to worry about spectral leakage
        #When the frame contains parts of a signal that do not cover a whole number of its periods, the endpoints are discontinuous
        #This causes artifacts (higher frequencies) to show up in the frequency graphs
        #For this reason frames are 'windowed' using a windowing function to preserve the mid-section of a sample
        #While silencing the endpoints
        #This is also why the frames overlap, so that the entire audio signal is still processed
        #'hop' length refers to the amount of shift to the right in samples in the frame
        #Once the signal is framed and windowed we pass it into the Fourier transform
        #And extract our low level frequency domain features, and aggregate them into a feature value, vector, or matrix
        
        



In [None]:
#Time domain features in depth

#Amplitude envelope
    #The maximum amplitude value of all samples in a frame
    #Gives rough idea of loudness
    #Sensitive to outliers
    #Very useful to onset detection, finding the point in the signal where a note is struck
    #Some higher level abstractions as well like genre classification (some genres are louder than others, folk vs metal)
#Root mean square energy
    #This is the root mean square of all the samples in a frame
    #Also an indicator of loudness
    #Less sensitive to outliers
    #Very useful for audio segmentation (consolidating instantaneous features into second-long segments) as RMSE changes a lot when a new musical event happens
#Zero crossing rate
    #Number of times a signal crosses the horizontal axis
    #Recognition of percussive vs pitched sounds
    #Percussive noises tend to have random ZCRs while pitched noises tend to have much more stable ZCRs

