In [90]:
%matplotlib inline
from scipy.io import wavfile
from scipy import signal
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing

# Preprocessing songs with Power Spectral Density

In [459]:
def processSong(path, generic=None, file=False, limit=True):
    """Turn a WAV file into per-window voice-band spectra.

    The first (or only) channel of the recording is passed to
    ``scipy.signal.spectrogram`` with windows of up to 2**16 samples. Each
    window's spectrum is cropped by ``voiceFilter`` and divided by the window
    length.

    Parameters
    ----------
    path : str
        Path of the WAV file to read.
    generic : str, optional
        Name of the matrix saved under ``mats/``; required when ``file`` is
        true and ignored otherwise.
    file : bool
        When true, save the matrix with ``np.save`` and return ``None``;
        otherwise return the matrix.
    limit : bool
        When true, drop windows whose mean cropped value is below 0.15.

    Returns
    -------
    numpy.ndarray or None
        One row per kept window, or ``None`` when ``file`` is true.

    Raises
    ------
    ValueError
        If ``file`` is true but ``generic`` was not given.
    """
    if file and generic is None:
        # Fail before the expensive spectrogram rather than at save time.
        raise ValueError("generic must be provided when file=True")

    fs, data = wavfile.read(path)
    # wavfile.read yields a 1-D array for mono files and a 2-D
    # (samples, channels) array otherwise; use the first channel either way.
    sig = data if data.ndim == 1 else data[:, 0]
    window = min(2**16, len(sig))

    _, _, signal_freq = signal.spectrogram(sig, fs=fs, nperseg=window, nfft=window)

    output = []
    for i in range(signal_freq.shape[1]):
        signal_chunk_freq = voiceFilter(signal_freq[:, i], fs, window) / window
        if limit and np.mean(signal_chunk_freq) < 0.15:
            # Too little energy in the voice band: skip this window.
            continue
        output.append(signal_chunk_freq)
    output = np.array(output)

    if file:
        # Make sure the target directory exists before np.save writes into it.
        os.makedirs("mats", exist_ok=True)
        np.save("mats/" + generic, output)
    else:
        return output
def voiceFilter(signal_freq, fs, window, minimum=50, maximum=1000):
    """Crop a spectrum to the bins between two frequencies.

    Parameters
    ----------
    signal_freq : array-like
        Spectrum values indexed by frequency bin along the first axis.
    fs : number
        Sampling rate of the signal.
    window : int
        Number of samples per analysis window.
    minimum, maximum : number
        Lower (inclusive) and upper (exclusive) frequency bounds, in the same
        unit as ``fs``.

    Returns
    -------
    The slice ``signal_freq[min_bin:max_bin]``.
    """
    # np.round returns a float, but slice bounds must be integers.
    min_bin = int(np.round(freqToBin(minimum, fs, window)))
    max_bin = int(np.round(freqToBin(maximum, fs, window)))
    return signal_freq[min_bin:max_bin]
def binToFreq(bin_num, fs, window):
    """Return the frequency that corresponds to spectrum bin ``bin_num``.

    The result is ``fs / window`` (the spacing between neighbouring bins for a
    transform of ``window`` samples) multiplied by the bin index.
    """
    bin_width = fs / window
    return bin_width * bin_num
def freqToBin(freq, fs, window):
    """Return the (fractional) spectrum bin index for frequency ``freq``.

    This is ``freq`` divided by the bin spacing ``fs / window``; callers that
    need an array index must round the result themselves.
    """
    bin_width = fs / window
    return freq / bin_width

In [460]:
# Convert every .wav file in songs/ into a feature matrix saved by processSong.
directory = "songs"
wav_names = [name for name in os.listdir(directory) if name.endswith(".wav")]
for filename in wav_names:
    processSong(os.path.join(directory, filename), generic=filename, file=True)



In [461]:
# Artist names; a sample's integer label is the index of its artist in this list.
categories = [
    "Beyonce",
    "Drake",
    "Coldplay",
]

In [475]:
# Load every saved feature matrix from mats/ and label each of its rows with
# the index of the category whose name prefixes the file name.
directory = "mats"
data = []
labels = []
# Sorted so the row order does not depend on the file system's listing order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".npy"):
        continue
    label = -1
    for i, category in enumerate(categories):
        if filename.startswith(category):
            label = i
    if label == -1:
        # An unmatched file would otherwise be trained on as class -1 and
        # later be reported as categories[-1]; leave it out instead.
        print("Skipping " + filename + ": no matching category")
        continue
    mat = np.load(os.path.join(directory, filename))
    data.extend(mat)
    labels.extend([label] * len(mat))
data = np.array(data)
labels = np.array(labels)

# PCA dimensionality reduction

In [476]:
print(data.shape)

(1223, 1412)


In [477]:
# Normalise each row, then project onto 200 principal components.
# `data` is overwritten with the reduced matrix, so re-running this cell
# without re-running the loading cell would reduce already-reduced data.
# No StandardScaler is applied after PCA; predict() accepts an optional
# `scaler` argument should one be added.
normalized = preprocessing.normalize(data)
pca = PCA(n_components=200)
data = pca.fit(normalized).transform(normalized)

In [478]:
print(data.shape)

(1223, 200)


# Classification (Decision Tree / Random Forest / AdaBoost / MLP)

In [479]:
# Shuffle the samples, then hold out the last 20% for validation.
# NOTE(review): the PCA above was fit on all rows, and windows of the same
# recording can land on both sides of this split, so the validation score is
# likely optimistic.
random = np.arange(len(data))
np.random.shuffle(random)
training_data = data[random]
training_labels = labels[random]

# 0.8 * len(...) is a float; slice bounds must be integers.
flip = int(0.8 * len(training_data))
validation_data = training_data[flip:]
validation_labels = training_labels[flip:]

training_data = training_data[:flip]
training_labels = training_labels[:flip]



In [480]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [481]:
# Multi-layer perceptron with hidden layers of 400 and 200 units and
# alpha=0.001; all other settings are left at their defaults.
clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(400, 200))
clf.fit(X=training_data, y=training_labels)

MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(400, 200), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# Test

## Validation data

In [482]:
# Fraction of validation samples whose predicted label equals the true label.
predicted_labels = clf.predict(validation_data)
correct = int(np.count_nonzero(validation_labels == predicted_labels))
print(correct / len(predicted_labels))

0.9836734693877551


## Entire songs

In [483]:
from collections import defaultdict

In [500]:
def predict(song, pca, clf, cats, scaler=None, limit=True):
    """Classify every window of a WAV file and tally the predicted categories.

    Parameters
    ----------
    song : str
        Path of the WAV file, passed to ``processSong``.
    pca : fitted transformer
        Object whose ``transform`` reduces the normalised windows.
    clf : fitted classifier
        Object whose ``predict`` returns one integer label per window.
    cats : sequence of str
        Category names indexed by predicted label.
    scaler : fitted transformer, optional
        Applied to the reduced windows when given.
    limit : bool
        Forwarded to ``processSong``.

    Returns
    -------
    collections.defaultdict
        Maps category name to the number of windows predicted as it; empty
        when ``processSong`` returns no windows.
    """
    counts = defaultdict(int)
    frames = processSong(song, limit=limit)
    if frames.size == 0:
        # No windows to classify; normalize() would raise on empty input.
        return counts
    frames = preprocessing.normalize(frames)
    song_data = pca.transform(frames)
    if scaler is not None:
        song_data = scaler.transform(song_data)
    predictions = clf.predict(song_data)
    print(predictions)
    for prediction in predictions:
        counts[cats[prediction]] += 1
    return counts

Drake - Hold on, we're going home

In [489]:
# Per-window predictions and their tally for a Drake track from test/.
drake = predict(
    "test/drake-hold-on-we27re-going-home.wav",
    pca=pca,
    clf=clf,
    cats=categories,
)
print(drake)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1
 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 2 2 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 0 1 1 1 1 2 1 0 1 0 0 0 0 0 0 0 0 0 1 0 2 0 1 0 2 2 0 1 1 0 0 1
 0 0 1 2 0 0 2 2 2 1 1 1 0 0 0 0 1 2 0 0 0 2 2 2 1 1 1 0 0 0 0 0 2 0 0 0 1
 1 1 1 1 0 1 1 1 0 1 0 1 0 1]
defaultdict(<class 'int'>, {'Drake': 87, 'Coldplay': 15, 'Beyonce': 60})




In [490]:
# Per-window predictions and their tally for a Beyonce track from test/.
beyonce = predict(
    "test/beyonce-partition.wav",
    pca=pca,
    clf=clf,
    cats=categories,
)
print(beyonce)

[0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0
 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
defaultdict(<class 'int'>, {'Drake': 22, 'Coldplay': 2, 'Beyonce': 104})




In [491]:
# Per-window predictions and their tally for a Coldplay track from test/.
coldplay = predict(
    "test/coldplay-yellow.wav",
    pca=pca,
    clf=clf,
    cats=categories,
)
print(coldplay)

[1 1 0 1 0 1 1 1 2 0 2 2 0 1 0 2 0 0 1 1 1 1 1 1 1 2 2 2 2 0 2 0 2 1 1 0 2
 2 2 2 2 2 2 2 2 2 1 1 0 1 1 2 2 2 0 1 2 0 2 0 0 1 1 0 2 1 2 2 2 1 2 2 2 0
 2 2 2 2 2 2 1 1 0 1 1 2 2 1 2 0 2 1 2 1 1 0 1 1 1 1 1 2 2 2 2 2 2 0 0 0 0
 0 0 0 2 2 0 2 2 2 2 2 0 0 1 1 0 2 2 1 2 0 2 1 0 2 2 1 0 2 2 0 2 0 1 1 1 2
 0 2 0 2 2 1 1 0 0 1 1 1 1 1 1 0 2 2 0 2 2 2 0 2 2 0 1 1 0 2 2 2 2 2 1 0 0
 0 1 1 0 1 0 0 0 2 2]
defaultdict(<class 'int'>, {'Drake': 60, 'Coldplay': 83, 'Beyonce': 52})




# Voice password

In [239]:
from subprocess import call

In [256]:
# Convert MP3 clips from testingdata/ into WAV files under speech/ by calling
# the external mpg123 program.
directory = "testingdata"
output_dir = "speech"
# Make sure the output directory exists before mpg123 is asked to write there.
os.makedirs(output_dir, exist_ok=True)
# Only the first 500 directory entries are considered; the .mp3 filter is
# applied after that cut, so fewer than 500 clips may be converted.
for filename in os.listdir(directory)[:500]:
    if not filename.endswith(".mp3"):
        continue
    path = os.path.join(directory, filename)
    output_path = os.path.join(output_dir, os.path.splitext(filename)[0] + ".wav")
    # call() returns the exit status; report failures instead of ignoring them.
    status = call(["mpg123", "-w", output_path, path])
    if status != 0:
        print("mpg123 failed (exit status " + str(status) + ") for " + path)

In [509]:
# Build the voice-password data set: windows from recordings in speech/ get
# label 0, windows from recordings in me/ get label 1.
directory = "speech"
speech_data = []
speech_labels = []
for filename in os.listdir(directory)[:500]:
    if not filename.endswith(".wav"):
        continue
    song = processSong(os.path.join(directory, filename), generic=filename, limit=False)
    speech_data.extend(song)
    speech_labels.extend([0] * len(song))

me = "me"
me_repeats = 15
for filename in os.listdir(me):
    if not filename.endswith(".wav"):
        continue
    song = processSong(os.path.join(me, filename), generic=filename, limit=False)
    # Each window is added 15 times in a row, presumably to offset the much
    # larger number of label-0 windows.
    # NOTE(review): the copies are made before any train/validation split, so
    # identical rows can end up on both sides of a later split.
    for point in song:
        speech_data.extend([point] * me_repeats)
        speech_labels.extend([1] * me_repeats)
speech_data = np.array(speech_data)
speech_labels = np.array(speech_labels)
print(speech_data.shape)

(5488, 1412)




In [510]:
# Normalise each row, then project onto 200 principal components.
# `speech_data` is overwritten with the reduced matrix, so this cell must not
# be re-run without rebuilding the data set first.
speech_normalized = preprocessing.normalize(speech_data)
speech_pca = PCA(n_components=200)
speech_data = speech_pca.fit(speech_normalized).transform(speech_normalized)

In [511]:
# Shuffle the speech samples, then hold out the last 20% for validation.
speech_random = np.arange(len(speech_data))
np.random.shuffle(speech_random)
speech_training_data = speech_data[speech_random]
speech_training_labels = speech_labels[speech_random]

# 0.8 * len(...) is a float; slice bounds must be integers.
speech_flip = int(0.8 * len(speech_training_data))
speech_validation_data = speech_training_data[speech_flip:]
speech_validation_labels = speech_training_labels[speech_flip:]

speech_training_data = speech_training_data[:speech_flip]
speech_training_labels = speech_training_labels[:speech_flip]



In [512]:
# Same network shape as the artist classifier: hidden layers of 400 and 200
# units with alpha=0.001.
speech_clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(400, 200))
speech_clf.fit(X=speech_training_data, y=speech_training_labels)

MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(400, 200), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [513]:
# Fraction of validation samples whose predicted label equals the true label.
speech_predicted_labels = speech_clf.predict(speech_validation_data)
correct = int(np.count_nonzero(speech_validation_labels == speech_predicted_labels))
print(correct / len(speech_predicted_labels))

0.9981785063752276


In [514]:
# Label 0 maps to "not Dorian" and label 1 to "Dorian".
speech_cats = ["not Dorian", "Dorian"]
predict(
    "test/dorian.wav",
    pca=speech_pca,
    clf=speech_clf,
    cats=speech_cats,
    limit=False,
)

[1 1 0 1 1 0]




defaultdict(<class 'int'>, {'Dorian': 4, 'not Dorian': 2})