In [1]:
import pandas as pd
import numpy as np

# Load the MFCC data set. RecordID is dropped up front so it is used neither
# as a feature nor as a label.
data = pd.read_csv('Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
data = data.drop(['RecordID'], axis=1)

# The three taxonomy columns that serve as ground-truth labels.
label_columns = ['Family', 'Genus', 'Species']


def _frequency_codes(column):
    """Return {value: code} for a column, with codes assigned in value_counts() order.

    value_counts() sorts by descending frequency, so code 0 is the most frequent
    value. The number of codes is derived from the data rather than hard-coded,
    so no category can be silently left without a code.
    """
    return {value: code for code, value in enumerate(column.value_counts().index)}


d1 = _frequency_codes(data['Family'])
d2 = _frequency_codes(data['Genus'])
d3 = _frequency_codes(data['Species'])

# Vectorised replacement of names by integer codes (one pass per column instead
# of three scalar .loc writes per row); the columns end up with a numeric dtype.
for column_name, codes in zip(label_columns, (d1, d2, d3)):
    data[column_name] = data[column_name].map(codes)

# Select by name so the split does not depend on the column order of the CSV.
features = data.drop(columns=label_columns)
true_labels = data[label_columns]

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabaz_score

# Choose the number of clusters by silhouette analysis: fit K-Means for every
# candidate k and keep the model with the highest average silhouette score.
range_k_clusters = list(range(2, 21, 1))

best_km = None
best_k = 0
best_silhouette = -1  # silhouette scores lie in [-1, 1], so any fit beats this

for k in range_k_clusters:
    km = KMeans(n_clusters=k).fit(features)
    # Score the fitted assignment (labels_) directly: it is the same assignment
    # that is kept below, and it avoids a second pass over the data via predict().
    silhouette_avg = silhouette_score(features, km.labels_)  # mean over all samples
    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_k = k
        best_km = km

# Cluster index of every sample under the selected model.
labels = best_km.labels_

print('We choose k = %d based on Silhouettes Analysis method.' % best_k)

We choose k = 4 based on Silhouettes Analysis method.


In [3]:
# For every cluster, count how often each Family / Genus / Species code occurs
# among its members, then take the most frequent code of each level as the
# label triplet that the cluster "predicts".
taxonomy_levels = ['Family', 'Genus', 'Species']

# Iterate positionally over the raw values: labels[j] belongs to row j of
# true_labels regardless of what the DataFrame index looks like.
label_matrix = true_labels[taxonomy_levels].values

# cluster_label_counts[c][level] -> {label code: number of samples in cluster c}
cluster_label_counts = [[dict() for _ in taxonomy_levels] for _ in range(best_k)]
for cluster_id, sample_labels in zip(labels, label_matrix):
    for tally, code in zip(cluster_label_counts[cluster_id], sample_labels):
        tally[code] = tally.get(code, 0) + 1
print('Counting the labels in each cluster:', cluster_label_counts)

# cluster_majority[c] -> [family, genus, species] with the highest count in c.
# On ties max() keeps the first key in insertion order.
cluster_majority = [
    [max(tally, key=tally.get) for tally in per_cluster]
    for per_cluster in cluster_label_counts
]
label_count = cluster_majority  # original name for the majority table, kept for compatibility
print('In each of the %d clusters,' % best_k)
print('we determine the majority of family, genus and species as', cluster_majority)
print('Each list represents the majority of family, genus and species.')

# Predicted label triplet of every sample = majority triplet of its cluster.
pre = [cluster_majority[cluster_id] for cluster_id in labels]
    
def ham_loss(y_true, y_pred):
    """Return the Hamming loss: the fraction of label entries that differ.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (any shape; lists, arrays and DataFrames are accepted).
    y_pred : array-like
        Predicted labels, compared element-wise against ``y_true``.

    Returns
    -------
    float
        Number of mismatching entries divided by the number of entries in
        ``y_true`` (0.0 means every entry matches).
    """
    # Coerce first so plain Python lists work too (they have no ``.size``).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sum(np.not_equal(y_true, y_pred)) / float(y_true.size)
# ham_loss returns the fraction of mismatching label entries (lower is better),
# so the message names the quantity as a loss rather than a score.
print('The average hamming loss between the true labels and the labels assigned by clusters is:', ham_loss(true_labels.values, np.array(pre)))

Counting the labels in each cluster: [[{0: 302, 2: 500, 1: 229}, {0: 296, 2: 500, 3: 189, 1: 1, 4: 6, 5: 39}, {2: 296, 3: 500, 5: 189, 1: 1, 6: 6, 7: 39}], [{0: 3467, 1: 101}, {0: 3466, 3: 86, 1: 12, 4: 1, 5: 3}, {0: 3466, 5: 86, 1: 12, 6: 1, 7: 3}], [{0: 22, 1: 590, 3: 2}, {0: 12, 1: 542, 4: 10, 6: 38, 7: 2, 5: 10}, {2: 6, 0: 6, 4: 452, 1: 90, 6: 10, 8: 38, 9: 2, 7: 10}], [{0: 629, 2: 42, 1: 1245, 3: 66}, {0: 376, 2: 42, 3: 35, 1: 1038, 4: 253, 6: 76, 7: 66, 5: 96}, {2: 370, 3: 42, 0: 6, 5: 35, 4: 20, 1: 1018, 6: 253, 8: 76, 9: 66, 7: 96}]]
In each of the 4 clusters,
we determine the majority of family, genus and species as [[2, 2, 3], [0, 0, 0], [1, 1, 4], [1, 1, 1]]
Each list represents the majority of family, genus and species.
The average hamming score between the true labels and the labels assigned by clusters is: 0.2224229789205467


# Monte-Carlo Simulation
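Repeat the following procedure 50 times: choose k by the silhouette score, assign every cluster its majority family, genus and species, and compute the Hamming loss against the true labels. Then report the average and standard deviation of the 50 Hamming losses.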

In [4]:
def _fit_best_kmeans(features, candidate_ks):
    """Fit K-Means for every k in candidate_ks and return the best model.

    "Best" means the highest average silhouette score. The running best is
    local to this call, so every call performs a fresh, independent selection.
    """
    best_model = None
    best_score = -1  # silhouette scores lie in [-1, 1]
    for k in candidate_ks:
        model = KMeans(n_clusters=k).fit(features)
        score = silhouette_score(features, model.labels_)
        if score > best_score:
            best_score = score
            best_model = model
    return best_model


def _majority_labels_per_cluster(cluster_ids, label_matrix, n_clusters):
    """Return, per cluster, the most frequent value of each label column.

    cluster_ids[j] is the cluster of row j of label_matrix. The result is a
    list with one entry per cluster, each a list with one majority value per
    column of label_matrix.
    """
    counts = [[dict() for _ in range(label_matrix.shape[1])] for _ in range(n_clusters)]
    for cluster_id, sample_labels in zip(cluster_ids, label_matrix):
        for tally, code in zip(counts[cluster_id], sample_labels):
            tally[code] = tally.get(code, 0) + 1
    return [[max(tally, key=tally.get) for tally in per_cluster] for per_cluster in counts]


n_runs = 50  # number of Monte-Carlo repetitions
range_k_clusters = list(range(2, 21, 1))
label_matrix = true_labels[['Family', 'Genus', 'Species']].values
hammings = []

for run in range(n_runs):
    # Select the model from scratch in every run. Carrying the best silhouette
    # over from earlier runs would let one early model be reused by later runs,
    # so the repetitions would no longer be independent.
    best_km = _fit_best_kmeans(features, range_k_clusters)
    best_k = best_km.n_clusters
    labels = best_km.labels_
    print('In the %d run, the best k we determined is %d' % (run + 1, best_k))

    # Majority [family, genus, species] of each cluster, then one predicted
    # triplet per sample according to the cluster it fell into.
    label_count = _majority_labels_per_cluster(labels, label_matrix, best_k)
    pre = [label_count[cluster_id] for cluster_id in labels]

    hamming = ham_loss(label_matrix, np.array(pre))
    print('The hamming loss is:', hamming)
    hammings.append(hamming)

In the 1 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 2 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 3 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 4 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 5 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 6 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 7 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 8 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 9 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 10 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 11 run, the best k we determined is 4
The hamming score is: 0.2219133657632615
In the 12 run, the best k we determined is 4
The hamming score 

In [6]:
# Summarise the Monte-Carlo runs: mean and standard deviation (NumPy's default
# std, i.e. ddof=0) of the collected Hamming losses.
hamming_values = np.asarray(hammings)
ham_avg = hamming_values.mean()
ham_std = hamming_values.std()
for caption, value in (
    ('The average of the 50 hamming loss is:', ham_avg),
    ('The standard deviation of the 50 hamming loss is:', ham_std),
):
    print(caption, value)

The average of the 50 hamming loss is: 0.23633171183692375
The standard deviation of the 50 hamming loss is: 0.011287970327656287
