In [56]:
import pandas as pd
import numpy as np
from plotly.offline import iplot
import plotly.express as px
import cufflinks as cf # for creating plots from pandas on the fly
import plotly.io as IO
cf.go_offline()
cf.set_config_file(theme='ggplot') #{'solar', 'pearl', 'white', 'ggplot'}
import plotly.figure_factory as ff

import plotly.express as px
from plotly.offline import iplot

import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier

from statsmodels.stats import weightstats

#### Dances to classify

In [57]:
classif_dances = ['walzer', 'deutscher', 'ländler', 'menuett', 'trio']
color_dances = {'walzer':'red','deutscher': 'orange','ländler': 'yellow','menuett': 'blue', 'trio': 'lightblue'}

In [58]:
rhythmic_patterns_all = pd.read_csv('data/tsv/os_patterns.tsv', sep = '\t', index_col = 0).fillna(0)
rhythmic_patterns_all.drop('no', axis = 1, inplace = True)
rhythmic_patterns_all.rename(lambda x: x+'_all', axis = 'columns', inplace = True)
both_hands = rhythmic_patterns_all.columns.to_list()

In [59]:
rhythmic_patterns_left = pd.read_csv('data/tsv/os_patterns_left.tsv', sep = '\t', index_col = 0).fillna(0)
rhythmic_patterns_left.drop('no', axis = 1, inplace = True)
rhythmic_patterns_left.rename(lambda x: x+'_left', axis = 'columns', inplace = True)
left_hand = rhythmic_patterns_left.columns.to_list()

In [60]:
rhythmic_patterns_right = pd.read_csv('data/tsv/os_patterns_right.tsv',sep = '\t', index_col = 0).fillna(0)
rhythmic_patterns_right.drop('no', axis = 1, inplace = True)
rhythmic_patterns_right.rename(lambda x: x+'_right', axis = 'columns', inplace = True)
right_hand = rhythmic_patterns_right.columns.to_list()

In [61]:
files = pd.read_csv('data/tsv/files.tsv', sep='\t')

In [62]:
files.head()

Unnamed: 0,id,D,no,dance,path,gt_mode,entropy,duration_entropy,onset_entropy,num_keys,...,TitiTitiTiti_right,TitiTitino_right,TitigiTa_right,TitigiTi_right,TitigiTigitigi_right,TitigiTiti_right,TitigiTitigi_right,Triole_right,nogiTimgi_right,nogiTiti_right
0,1,41,1,menuett,041/D041menuett01a.mscx,major,0.456823,1.547558,1.693202,3,...,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,41,1,trio,041/D041trio01b.mscx,major,0.578475,1.18684,2.01186,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,41,2,menuett,041/D041menuett02a.mscx,major,0.564445,1.41572,1.521429,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,41,2,trio,041/D041trio02b.mscx,major,0.6127,0.947097,2.14705,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,41,3,menuett,041/D041menuett03a.mscx,major,0.588911,1.508821,1.763593,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Select the features

In [63]:
features = files.dropna()

Galopp and Cotillon appear only few times -> drop.
Ecossaise is easy to identify by meter -> drop

In [64]:
indexNames = features[(features['dance'] == 'galopp') | (features['dance'] == 'cotillon') | (features['dance'] == 'ecossaise')].index
features.drop(indexNames , inplace=True)

Add a numerical label because it is needed for the classification

In [149]:
dance_dict = {'deutscher':0, 'ländler':2, 'menuett':3, 'trio':4, 'walzer':5}

def label_dance(row):
    return dance_dict[row]

features['dance_num_label'] = features['dance'].apply (lambda row: label_dance(row))

Visualize the discriminativity of the features

In [150]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib import colors

def tSNE(feature_matrix, number_of_dances=8):
    tsne_features = TSNE(n_components=2).fit_transform(feature_matrix)    
    features['TSNE1'] = tsne_features[:, 0]
    features['TSNE2'] = tsne_features[:, 1]

Make features a matrix. Save labels as an array.

In [151]:
selected_features = ['ratio_downbeat_non_downbeat', 'ratio_downbeat_non_downbeat_onset', 
                     'ratio_downbeat_non_downbeat_strictly','num_keys','event_density',
                     'sixths_count', 'thirds_count',
                     'downbeat_dur', 'entropy', 'maj-min-first',
                     'duration_entropy', 'onset_entropy', 
                     'interval_downbeat_offbeat', 'eigth_notes_frac',
                     'maj_min_first_abs', 'start_end_key', 'onset_density'] + right_hand + left_hand


In [169]:
feature_matrix = features[selected_features].to_numpy()
#feature_matrix = features.drop(columns=['dance', 'dance_num_label', 'path', 'gt_mode', 'id', 'D', 'no']).to_numpy()
labels = features[['dance_num_label']].to_numpy()
labels = np.ravel(labels)

Standardize the features

In [178]:
scaler = StandardScaler()
scaler.fit(feature_matrix)
feature_matrix_scaled = scaler.transform(feature_matrix)

Implementing Stratified K-fold cross-validation, to maintain uniform distribution of classes across the folds. 

In [186]:
n_splits = 10

In [187]:
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state = 42)

In [188]:
n_classes = len(np.unique(labels))
global_confusion = np.zeros((n_classes, n_classes))
kappa_global = []
feature_importance = np.zeros(len(selected_features))
feature_importance_errors = []

for train_index, test_index in kf.split(feature_matrix_scaled, labels):
    features_train, features_test = feature_matrix_scaled[train_index], feature_matrix_scaled[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    
    
    
    clf = ensemble.RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, class_weight = 'balanced_subsample', max_features= 'auto', bootstrap=True, random_state=32)
    clf.fit(features_train, labels_train)
    labels_predicted = clf.predict(features_test)
   
    cm = confusion_matrix(labels_test, labels_predicted)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] # Normalize confusion map
    cm = np.around(cm, decimals=2)
    
    
    
    feature_importance += clf.feature_importances_
    feature_importance_errors += [list(clf.feature_importances_)]
    global_confusion = global_confusion + cm
    kappa_global += [cohen_kappa_score(labels_test, labels_predicted)]
    
#Averaged feature importance over the n_split folds
feature_importance = feature_importance / n_splits
#95% Confidence interval over the n_splits folds
feature_importance_errors = list(map(lambda x: (weightstats.DescrStatsW(x).tconfint_mean(0.05)[1]-weightstats.DescrStatsW(x).tconfint_mean(0.05)[0])/2, np.array(feature_importance_errors).transpose()))


#Mean Cohen's Kappa score and 95% CI of the mean over the n_splits folds 
kappa_global_mean = np.mean(kappa_global)
kappa_global_CI = weightstats.DescrStatsW(kappa_global).tconfint_mean(0.05)

#Average confusion matrix
global_confusion = global_confusion / n_splits
global_confusion = np.around(global_confusion, decimals=2)

## Feature ranking

In [189]:
feature_ranking = pd.DataFrame(np.array([feature_importance, feature_importance_errors]).transpose(), index = selected_features, columns = ['importance', 'CI']).sort_values(by = 'importance', ascending = False)
top_features = feature_ranking[feature_ranking.importance >.02]

In [190]:
fig = go.Figure()
fig.add_trace(go.Bar(
    name='Feature importance: top 20',
    x=top_features.index, y=top_features.importance,
    error_y=dict(type='data', array=top_features.CI)
))

fig.update_layout(barmode='group')
fig.show()

In [191]:
dance_list = list(dance_dict.keys())

fig = ff.create_annotated_heatmap(z=global_confusion, x=dance_list, y=dance_list)

#fig = go.Figure(data=go.Heatmap(z=cm, x=dance_list, y=dance_list))
fig.update_layout(title='Confusion Map For Dance Classification',
                  xaxis_title="Predicted labels", yaxis_title="True labels")
fig.data[0].update(zmin=0, zmax=1)
fig.show()

In [192]:
print(f"The mean Cohen's Kappa score over the {n_splits}-fold cross validation is {kappa_global_mean}, with the 95% confidence interval {kappa_global_CI}.")
      

The mean Cohen's Kappa score over the 10-fold cross validation is 0.4607994573864187, with the 95% confidence interval (0.37239771954367745, 0.5492011952291599).


# Leave one out

In [112]:
performance = []
for x in range(1, 60):
    feature_matrix = features[feature_ranking.iloc[:x].index].to_numpy()
    #feature_matrix = features.drop(columns=['dance', 'dance_num_label', 'path', 'gt_mode', 'id', 'D', 'no']).to_numpy()
    labels = features[['dance_num_label']].to_numpy()
    labels = np.ravel(labels)

    scaler = StandardScaler()
    scaler.fit(feature_matrix)
    feature_matrix_scaled = scaler.transform(feature_matrix)

    n_splits_one = len(labels)

    kf = KFold(n_splits=n_splits_one, shuffle=True)

    n_classes = len(np.unique(labels))
    predicted = []
    true = []


    for train_index, test_index in kf.split(feature_matrix_scaled, labels):
        features_train, features_test = feature_matrix_scaled[train_index], feature_matrix_scaled[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]
    
        #clf = svm.SVC(gamma='scale', C=100, decision_function_shape='ovr')
        #clf.fit(features_train, labels_train)
    
        clf = ensemble.RandomForestClassifier(n_estimators=50, criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features= 'auto', bootstrap=True)
        clf.fit(features_train, labels_train)
        labels_predicted = clf.predict(features_test)
   
        predicted += [labels_predicted]
        true += [labels_test]
    
    
    cm = confusion_matrix(true, predicted)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] # Normalize confusion map
    cm = np.around(cm, decimals=2)
    

    #Cohen's Kappa score 
    kappa_one = cohen_kappa_score(true, predicted)
    performance += [(x, kappa_one)]
    
performance = pd.DataFrame(performance, columns = ['n_features', 'kappa'])
optimal_n_features = performance[performance.kappa == performance.kappa.max()].n_features
performance

Unnamed: 0,n_features,kappa
0,1,0.101699
1,2,0.210159
2,3,0.305753
3,4,0.286099
4,5,0.374166
5,6,0.320369
6,7,0.375824
7,8,0.364685
8,9,0.371648
9,10,0.388045


In [113]:
optimal_n_features

55    56
Name: n_features, dtype: int64

In [86]:
fig = ff.create_annotated_heatmap(z=cm, x=dance_list, y=dance_list)

#fig = go.Figure(data=go.Heatmap(z=cm, x=dance_list, y=dance_list))
fig.update_layout(title='Confusion Map For Dance Classification',
                  xaxis_title="Predicted labels", yaxis_title="True labels")
fig.data[0].update(zmin=0, zmax=1)
fig.show()

In [87]:
print(f'The "leave one out" predictions of the model show a Cohen Kappa score of {kappa_one}')

The "leave one out" predictions of the model show a Cohen Kappa score of 0.4252589606899839


# TSNE

In [144]:
 
feature_matrix = features[top_features.index].to_numpy()
#feature_matrix = features.drop(columns=['dance', 'dance_num_label', 'path', 'gt_mode', 'id', 'D', 'no']).to_numpy()
labels = features[['dance_num_label']].to_numpy()
labels = np.ravel(labels)

scaler = StandardScaler()
scaler.fit(feature_matrix)
feature_matrix_scaled = scaler.transform(feature_matrix)


tSNE(feature_matrix_scaled)

px.scatter(features, x = 'TSNE1', y = 'TSNE2', color = 'dance')

# French vs. German

In [677]:
national_dances = {'menuett' : 'french', 'trio': 'french', 'ländler': 'german', 'walzer': 'german', 'deutscher': 'german'  }
national_dances_num = {0: -1, 1: -3, 2: -1, 3: -2, 4: -2, 5: -1}

features_national = features.replace({'dance': national_dances, 'dance_num_label': national_dances_num })[features.dance != 'ecossaise']

feature_matrix_national = features_national[top_features.index].to_numpy()

labels_national = features_national[['dance_num_label']].to_numpy()

scaler_national = StandardScaler()
scaler_national.fit(feature_matrix_national)
feature_matrix_scaled_national = scaler_national.transform(feature_matrix_national)

labels_national = np.ravel(labels_national)
features_national_train, features_national_test, labels_national_train, labels_national_test = train_test_split(feature_matrix_scaled_national,
                                                                            labels_national,
                                                                            test_size=0.33,
                                                                            random_state=43)
clf_national = svm.SVC(gamma='scale', C=100, decision_function_shape='ovr')
clf_national.fit(features_national_train, labels_national_train)

labels_national_predicted = clf_national.predict(features_national_test)

cm_national = confusion_matrix(labels_national_test, labels_national_predicted)
cm_national = cm_national.astype('float') / cm_national.sum(axis=1)[:, np.newaxis] # Normalize confusion map
cm_national = np.around(cm_national, decimals=2)



fig = ff.create_annotated_heatmap(z=cm_national, x=['french', 'german'], y=['french', 'german'])
fig.update_layout(title='Confusion Map For Dance Classification',
                  xaxis_title="Predicted labels", yaxis_title="True labels")
fig

In [682]:
kappa_classifier_national = cohen_kappa_score(labels_national_test, labels_national_predicted)

print(f"The classification of dance types has a Cohen's kappa score of {kappa_classifier_national}")

The classification of dance types has a Cohen's kappa score of 0.8198866114260794
