In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.io import wavfile
from IPython.display import clear_output
from preprocessing.kernels import DWT, Normalize, Dropna
from preprocessing.data import Dataset

# Defining Speaker information Based on speaker code (found in each filename)
speakers = { 
    '03' : ('male', 31),
    '08' : ('female', 34),
    '09' : ('female', 21),
    '10' : ('male', 32),
    '11' : ('male', 26),
    '12' : ('male', 30),
    '13' : ('female', 32),
    '14' : ('female', 35),
    '15' : ('male', 25),
    '16' : ('female', 31)
}

#Defining label and Emotion string from emotion code (found in each filename)
emotions = {
    'W' : 'anger',
    'L' : 'boredom',
    'E' : 'disgust',
    'A' : 'anxiety/fear',
    'F' : 'happiness',
    'T' : 'sadness',
    'N' : 'neutral',
}


# Checking if dataset.pkl is in directory
if os.path.exists('dataset.pkl'):
    dataset = Dataset.load()
else:
    temporalData = []
    #Reading Data from database and appending to list
    classes = {'speaker':[], 'sex':[], 'age': [], 'text code':[], 'emotion':[]}
    for filename in os.listdir('Berlin Database Of Emotional Speech'):
        # Reading Data from .wav Files
        if filename.endswith('.wav'):
            samplerate, data = \
                wavfile.read('Berlin Database Of Emotional Speech/'+filename)
            temporalData.append(data)
            classes['speaker'].append(filename[:2])
            classes['sex'].append(speakers[filename[:2]][0])
            classes['age'].append(speakers[filename[:2]][1])
            classes['text code'].append(filename[2:5])
            classes['emotion'].append(emotions[filename[5]])
            
    dataset = Dataset(temporalData, 2**14,  
        overlap = 0.6, 
        verbose = True, 
        classes= pd.DataFrame(classes), 
        processors=[
            DWT(DWT.daubechies,14,DWT.features),
            Normalize(),
            Dropna(),
        ])
    dataset.save()

In [6]:
# importing a tree desicion model from scikit learn
from sklearn.tree import DecisionTreeClassifier
from selection import forwardFeatureSelection, plot_loss, early_stopping
X_train, X_test, y_train, y_test = dataset.get_train_test(test_size=0.2, random_state=42)
def func(selection):
    # training tree desicion model on dataset[features] and emotions[labels][emotions]
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train[selection], y_train)
    # Calculate loss
    return -model.score(X_test[selection], y_test)

%matplotlib qt
res = forwardFeatureSelection(func, list(dataset.features.columns),
    callbacks = [
        plot_loss(),
        early_stopping(5),
    ])
res

Early stopping, iteration 14, loss did not improve for 5 iterations.


{'selection': ['db5-0-cA-rms',
  'db9-0-cD-rms',
  'db10-1-cD-var',
  'db7-0-cA-mean',
  'db3-0-cA-skew',
  'db4-0-cA-mean',
  'db7-1-cD-kurt',
  'db2-4-cA-kurt',
  'db10-3-cA-rms',
  'db2-6-cD-kurt',
  'db7-1-cD-kurt',
  'db2-4-cA-kurt',
  'db2-5-cA-skew',
  'db2-13-cD-kurt'],
 'value': -0.572265625}

In [5]:
res = {'selection': ['db5-0-cA-rms',
  'db9-0-cD-rms',
  'db10-1-cD-var',
  'db7-0-cA-mean',
  'db3-0-cA-skew',
  'db4-0-cA-mean',
  'db7-1-cD-kurt',
  'db2-4-cA-kurt',
  'db10-3-cA-rms',
  'db2-6-cD-kurt',
  'db7-1-cD-kurt',
  'db2-4-cA-kurt',
  'db2-5-cA-skew',
  'db2-13-cD-kurt'],
 'value': -0.572265625}

In [9]:
X = dataset['features'][res['selection']]
y = dataset['labels']['emotion']

# Alternativa 
# X = dataset['selection']

# Alternativa
#X = dataset.X o dataset.get_x()
#y = dataset.y o dataset.get_y()


# Clustering
# from sklearn.cluster import KMeans

# #Opcion 1: 
# kmeans = KMeans(n_clusters=2, random_state=0).fit(dataset.X)

# # Opcion 3
# dataset.cluster(KMeans(n_clusters=2, random_state=0))
# dataset['clusters']['kmeans']

# Otra clase que maneje la Dataset y los algoritmos que se le apliquen
# class Interactor( (matplotlib figure?) ):

#     def __init__(self, dataset):
#         self.dataset = dataset

#     def cluster(self, algorithm):
#         algorithm.fit(self.dataset.X)
     
#     def selection(self, algorithm):
#         algorithm.select(self.dataset.X, self.dataset.)

#     def dimensionalityReduction(self, algorithm):
#         embedding = algorithm.fit(self.dataset.X)

#     def plot(self, ):
#         self.dataset.plot()



Unnamed: 0,db5-0-cA-rms,db9-0-cD-rms,db10-1-cD-var,db7-0-cA-mean,db3-0-cA-skew,db4-0-cA-mean,db7-1-cD-kurt,db2-4-cA-kurt,db10-3-cA-rms,db2-6-cD-kurt,db7-1-cD-kurt.1,db2-4-cA-kurt.1,db2-5-cA-skew,db2-13-cD-kurt
0,0.253990,0.359717,0.000062,0.718642,0.624191,0.718908,0.107402,0.995290,0.000333,0.500,0.107402,0.995290,0.000756,0.250
1,0.258855,0.318040,0.000656,0.710137,0.625199,0.711435,0.060494,0.995290,0.041291,0.375,0.060494,0.995290,0.000756,0.500
2,0.207510,0.149288,0.001635,0.706324,0.581945,0.708722,0.268954,0.995422,0.112868,0.375,0.268954,0.995422,0.000756,0.375
3,0.062533,0.159337,0.000423,0.720512,0.647452,0.720463,0.440113,0.995365,0.023765,0.750,0.440113,0.995365,0.999990,0.625
4,0.055060,0.128398,0.000500,0.732389,0.617590,0.730060,0.238240,0.995271,0.131828,0.500,0.238240,0.995271,0.000756,0.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,0.219652,0.167864,0.002117,0.667782,0.567498,0.674462,0.257095,0.995440,0.280498,0.875,0.257095,0.995440,0.999990,0.625
2554,0.166899,0.131394,0.000770,0.637738,0.567626,0.637635,0.161516,0.995384,0.011808,0.500,0.161516,0.995384,0.999990,0.625
2555,0.239158,0.156433,0.000832,0.725547,0.566108,0.725095,0.098564,0.995403,0.030715,0.875,0.098564,0.995403,0.000756,0.625
2556,0.273439,0.357633,0.000878,0.700408,0.594745,0.700090,0.287755,0.995384,0.049751,0.500,0.287755,0.995384,0.999990,0.625


In [None]:
# Alternativa mas 


In [None]:
dataset['cluster']