In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.io import wavfile
from IPython.display import clear_output
from package.kernels import DWT, Normalize, Dropna
from package.data import Dataset

# Defining Speaker information Based on speaker code (found in each filename)
speakers = { 
    '03' : ('male', 31),
    '08' : ('female', 34),
    '09' : ('female', 21),
    '10' : ('male', 32),
    '11' : ('male', 26),
    '12' : ('male', 30),
    '13' : ('female', 32),
    '14' : ('female', 35),
    '15' : ('male', 25),
    '16' : ('female', 31)
}

#Defining label and Emotion string from emotion code (found in each filename)
emotions = {
    'W' : 'anger',
    'L' : 'boredom',
    'E' : 'disgust',
    'A' : 'anxiety/fear',
    'F' : 'happiness',
    'T' : 'sadness',
    'N' : 'neutral',
}


# Checking if dataset.pkl is in directoryd
dataset_path = 'dataset.pkl'
if os.path.exists(dataset_path):
    dataset = Dataset.load(filename=dataset_path)
else:
    temporalData = []
    #Reading Data from database and appending to list
    classes = {'speaker':[], 'sex':[], 'age': [], 'text code':[], 'emotion':[]}
    for filename in os.listdir('Berlin Database Of Emotional Speech'):
        # Reading Data from .wav Files
        if filename.endswith('.wav'):
            samplerate, data = \
                wavfile.read('Berlin Database Of Emotional Speech/'+filename)
            temporalData.append(data)
            classes['speaker'].append(filename[:2])
            classes['sex'].append(speakers[filename[:2]][0])
            classes['age'].append(speakers[filename[:2]][1])
            classes['text code'].append(filename[2:5])
            classes['emotion'].append(emotions[filename[5]])
            
    dataset = Dataset(temporalData, 2**14,  
        overlap = 0.6, 
        verbose = True, 
        classes= pd.DataFrame(classes), 
        processors=[
            DWT(DWT.daubechies,14,DWT.features),
            Normalize(),
            Dropna(),
        ])
    dataset.save(filename=dataset_path)

In [2]:
dataset.features

Unnamed: 0,db1-0-cA-kurt,db1-0-cA-mean,db1-0-cA-rms,db1-0-cA-skew,db1-0-cA-var,db1-0-cD-kurt,db1-0-cD-mean,db1-0-cD-rms,db1-0-cD-skew,db1-0-cD-var,...,db9-9-cA-kurt,db9-9-cA-mean,db9-9-cA-rms,db9-9-cA-skew,db9-9-cA-var,db9-9-cD-kurt,db9-9-cD-mean,db9-9-cD-rms,db9-9-cD-skew,db9-9-cD-var
0,0.016479,0.720750,0.280821,0.659831,0.211679,0.005622,0.521760,0.452244,0.389512,0.232090,...,0.823672,0.554565,0.000615,1.000000e+00,0.000002,0.822498,0.458742,0.001462,2.078593e-11,0.000002
1,0.017818,0.712065,0.269230,0.659172,0.191827,0.007455,0.493495,0.425014,0.406029,0.200358,...,0.827987,0.543129,0.020093,1.000000e+00,0.000785,0.826791,0.469497,0.027996,2.029858e-11,0.000785
2,0.065527,0.710415,0.193790,0.597007,0.062482,0.015730,0.529939,0.236839,0.418597,0.030840,...,0.827588,0.507054,0.085859,1.000000e+00,0.023278,0.826439,0.509434,0.152550,2.033108e-11,0.023278
3,0.031993,0.721588,0.086898,0.701616,0.044626,0.010787,0.536115,0.195562,0.417628,0.061826,...,0.825245,0.560066,0.012004,1.000000e+00,0.004848,0.824042,0.450943,0.069607,2.060900e-11,0.004848
4,0.024528,0.730047,0.073636,0.680571,0.056850,0.012466,0.491011,0.165987,0.425891,0.126065,...,0.827907,0.623461,0.125713,3.303609e-12,0.038309,0.826749,0.392138,0.195706,1.000000e+00,0.038309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,0.027595,0.683111,0.184285,0.570067,0.055844,0.004609,0.471764,0.239413,0.417358,0.030909,...,0.828917,0.391728,0.293681,1.000000e+00,0.094897,0.827820,0.587841,0.308036,2.017321e-11,0.094897
2554,0.063539,0.639738,0.155490,0.582471,0.035313,0.012447,0.622276,0.175116,0.449746,0.026444,...,0.825052,0.556919,0.005449,1.000000e+00,0.000903,0.823881,0.456735,0.030022,2.062906e-11,0.000903
2555,0.051294,0.726250,0.210842,0.567131,0.069403,0.008459,0.474004,0.275370,0.438378,0.019445,...,0.908450,0.571738,0.031521,5.479636e-13,0.000002,0.908627,0.443151,0.001305,1.000000e+00,0.000002
2556,0.023102,0.701643,0.287947,0.617420,0.209112,0.008074,0.492287,0.466945,0.432400,0.068820,...,0.825146,0.567352,0.026104,1.000000e+00,0.017732,0.823959,0.444602,0.133140,2.061803e-11,0.017732


In [4]:
# importing a tree desicion model from scikit learn
from sklearn.tree import DecisionTreeClassifier
from selection import forwardFeatureSelection, plot_loss, early_stopping
X_train, X_test, y_train, y_test = dataset.get_train_test(test_size=0.2, random_state=42)
def func(selection):
    # training tree desicion model on dataset[features] and emotions[labels][emotions]
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train[selection], y_train)
    # Calculate loss
    return -model.score(X_test[selection], y_test)

#%matplotlib inline
res = forwardFeatureSelection(func, list(dataset.features.columns),
    callbacks = [
        plot_loss(),
        early_stopping(5),
    ])
res

ModuleNotFoundError: No module named 'sklearn'

Unnamed: 0,anger,anxiety/fear,boredom,disgust,happiness,neutral,sadness
0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2553,0,0,1,0,0,0,0
2554,0,1,0,0,0,0,0
2555,0,1,0,0,0,0,0
2556,0,1,0,0,0,0,0


In [19]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder().fit(np.array(dataset.labels['emotion']).reshape(-1,1))
enc.categories

'auto'

In [5]:
res = {'selection': ['db5-0-cA-rms',
  'db9-0-cD-rms',
  'db10-1-cD-var',
  'db7-0-cA-mean',
  'db3-0-cA-skew',
  'db4-0-cA-mean',
  'db7-1-cD-kurt',
  'db2-4-cA-kurt',
  'db10-3-cA-rms',
  'db2-6-cD-kurt',
  'db7-1-cD-kurt',
  'db2-4-cA-kurt',
  'db2-5-cA-skew',
  'db2-13-cD-kurt'],
 'value': -0.572265625}

In [9]:
X = dataset['features'][res['selection']]
y = dataset['labels']['emotion']

# Alternativa 
# X = dataset['selection']

# Alternativa
#X = dataset.X o dataset.get_x()
#y = dataset.y o dataset.get_y()


# Clustering
# from sklearn.cluster import KMeans

# #Opcion 1: 
# kmeans = KMeans(n_clusters=2, random_state=0).fit(dataset.X)

# # Opcion 3
# dataset.cluster(KMeans(n_clusters=2, random_state=0))
# dataset['clusters']['kmeans']

# Otra clase que maneje la Dataset y los algoritmos que se le apliquen
# class Interactor( (matplotlib figure?) ):

#     def __init__(self, dataset):
#         self.dataset = dataset

#     def cluster(self, algorithm):
#         algorithm.fit(self.dataset.X)
     
#     def selection(self, algorithm):
#         algorithm.select(self.dataset.X, self.dataset.)

#     def dimensionalityReduction(self, algorithm):
#         embedding = algorithm.fit(self.dataset.X)

#     def plot(self, ):
#         self.dataset.plot()



Unnamed: 0,db5-0-cA-rms,db9-0-cD-rms,db10-1-cD-var,db7-0-cA-mean,db3-0-cA-skew,db4-0-cA-mean,db7-1-cD-kurt,db2-4-cA-kurt,db10-3-cA-rms,db2-6-cD-kurt,db7-1-cD-kurt.1,db2-4-cA-kurt.1,db2-5-cA-skew,db2-13-cD-kurt
0,0.253990,0.359717,0.000062,0.718642,0.624191,0.718908,0.107402,0.995290,0.000333,0.500,0.107402,0.995290,0.000756,0.250
1,0.258855,0.318040,0.000656,0.710137,0.625199,0.711435,0.060494,0.995290,0.041291,0.375,0.060494,0.995290,0.000756,0.500
2,0.207510,0.149288,0.001635,0.706324,0.581945,0.708722,0.268954,0.995422,0.112868,0.375,0.268954,0.995422,0.000756,0.375
3,0.062533,0.159337,0.000423,0.720512,0.647452,0.720463,0.440113,0.995365,0.023765,0.750,0.440113,0.995365,0.999990,0.625
4,0.055060,0.128398,0.000500,0.732389,0.617590,0.730060,0.238240,0.995271,0.131828,0.500,0.238240,0.995271,0.000756,0.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,0.219652,0.167864,0.002117,0.667782,0.567498,0.674462,0.257095,0.995440,0.280498,0.875,0.257095,0.995440,0.999990,0.625
2554,0.166899,0.131394,0.000770,0.637738,0.567626,0.637635,0.161516,0.995384,0.011808,0.500,0.161516,0.995384,0.999990,0.625
2555,0.239158,0.156433,0.000832,0.725547,0.566108,0.725095,0.098564,0.995403,0.030715,0.875,0.098564,0.995403,0.000756,0.625
2556,0.273439,0.357633,0.000878,0.700408,0.594745,0.700090,0.287755,0.995384,0.049751,0.500,0.287755,0.995384,0.999990,0.625


In [None]:
# Alternativa mas 


In [None]:
dataset['cluster']