In [19]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score, KFold
from sklearn.svm import SVC,LinearSVC
import numpy as np
from sklearn.metrics import accuracy_score,f1_score,silhouette_score,make_scorer, hamming_loss
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

In [20]:
df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')
df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [21]:
hamming_dist = [] 
hamming_loss = []
def report_hamming(major, pred_y, train_y):
    miss = 0
    for i in range(len(major)):
        idx = np.nonzero(pred_y == i)[0]
        train = train_y.iloc[idx].values
        pred = major.loc[i].values
        for j in range(len(train)):
            miss += np.sum(train[j] != pred)
    num_rows, num_cols = df.shape
    cur_dist = miss / num_rows
    cur_loss = miss / num_rows * num_cols
    hamming_dist.append(cur_dist)
    hamming_loss.append(cur_loss)
    print("")
    print(f"Hamming Distance: {round(cur_dist, 4)}, Hamming Loss: {round(cur_loss, 4)}")
    print("-----------------------------------------------")

In [22]:
def report_major_class(optimal_k, pred_y, train_y):
    #create an emtpy data frame to store the major class for each class with k=1...best_k
    major = pd.DataFrame(columns = train_y.columns)
    for i in range(optimal_k):
        idx = np.nonzero(pred_y == i)[0]
        #find the data corresponding to idx in train_y
        cluster_samples = train_y.iloc[idx, :]
        train = train_y.columns
        row = []
        for j in range(len(train)):
            #find the major class for each label
            cur = train_y.iloc[idx, :].loc[:, train[j]].value_counts().index[0]
            #append it to the row of major class of corresponding k
            row.append(cur)
        major.loc[i] = row
    print(major)
    return major

In [23]:
def silhouette_optimal_k(train_x,rand):
    optimal_k = 2 
    max_score = 0
    for i in range(2, 5):
        model = KMeans(n_clusters = i)
        pred_y = model.fit_predict(train_x)
        silhouette_avg = silhouette_score(train_x, pred_y)
        if silhouette_avg > max_score:
            optimal_k = i
            max_score = silhouette_avg
    print("optimal K of this iteration: ", optimal_k)
    print("")
    return optimal_k

In [24]:
def monte_carlo(iters, train_x, train_y):
    for i in range(iters):
        print("Iteration: ", i+1)
        optimal_k = silhouette_optimal_k(train_x,i)
        model = KMeans(n_clusters = optimal_k)
        #Compute cluster centers and predict cluster index for each sample.
        pred_y = model.fit_predict(train_x)
        
        #Get the major class for each class with k=1...best_k
        major = report_major_class(optimal_k, pred_y, train_y)

        #compute hamming distance and hamming loss
        report_hamming(major, pred_y, train_y)

In [25]:
monte_carlo(10, df.iloc[:, :-4], df.iloc[:, -4:-1])

Iteration:  1
optimal K of this iteration:  4

            Family      Genus                 Species
0          Hylidae  Hypsiboas       HypsiboasCordobae
1  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
2  Leptodactylidae  Adenomera          AdenomeraAndre
3          Hylidae  Hypsiboas       HypsiboasCordobae

Hamming Distance: 0.7358, Hamming Loss: 19.1305
-----------------------------------------------
Iteration:  2
optimal K of this iteration:  4

            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1          Hylidae  Hypsiboas       HypsiboasCordobae
2  Leptodactylidae  Adenomera          AdenomeraAndre
3          Hylidae  Hypsiboas       HypsiboasCordobae

Hamming Distance: 0.7358, Hamming Loss: 19.1305
-----------------------------------------------
Iteration:  3
optimal K of this iteration:  4

            Family      Genus                 Species
0    Dendrobatidae   Ameerega      Ameeregatrivittata
1  Leptodactylida

In [4]:
# utility function to get the optimal value of K
def getOptimalK(num_cluster, X, rand):
    optimalK, max_score = 2, 0
    num_cluster = num_cluster + 1
    for n in range(2, num_cluster):
        clusterer = KMeans(n_clusters=n, random_state=rand)
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        if silhouette_avg > max_score:
            optimalK = n
            max_score = silhouette_avg
    print(f"\nThe optimal K is: {optimalK}")
    return optimalK
# utility function to get majority labels of a cluster
def getMajorityLabels(optimalK, cluster_labels, Y):
    cluster_major = pd.DataFrame(columns=Y.columns)
    for c in range(optimalK):
        idx, = np.where(cluster_labels == c)
        cluster_samples = Y.iloc[idx, :]
        row = []
        for label in Y.columns:
            cur_major = cluster_samples.loc[:, label].value_counts().index[0]
            row.append(cur_major)
        cluster_major.loc[c] = row
    return cluster_major
# utility function to calculate and get hamming distance/loss
def evaluation(cluster_major, cluster_labels, Y):
    missclf_labels = 0
    for c in range(len(cluster_major)):
        idx, = np.where(cluster_labels == c)
        for label in Y.loc[idx].values:
            miss = (label != cluster_major.loc[c].values)
            missclf_labels += np.sum(miss)
    hamming_dist = missclf_labels / Y.shape[0]
    hamming_loss = missclf_labels / (Y.shape[0] * Y.shape[1])
    return hamming_dist, hamming_loss
def monteCarlo(times, X, Y):
    hamming_dist = []
    hamming_loss = []
    for i in range(times):
        optimalK = getOptimalK(5, X, i)
        clusterer = KMeans(n_clusters=optimalK, random_state=i)
        cluster_labels = clusterer.fit_predict(X)
        cluster_major = getMajorityLabels(optimalK, cluster_labels, Y)
#         cluster_major = cluster_major.astype('int64')
        cur_dist, cur_loss = evaluation(cluster_major, cluster_labels, Y)
        hamming_dist.append(cur_dist)
        hamming_loss.append(cur_loss)
        print(f"Iteration {i + 1} | Hamming Distance: {round(cur_dist, 4)}, Hamming Loss: {round(cur_loss, 4)}")
    return hamming_dist, hamming_loss
iterations = 10
hamming_dist, hamming_loss = monteCarlo(iterations, df.iloc[:, :-4], df.iloc[:, -4:-1])


The optimal K is: 4
Iteration 1 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 2 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 3 | Hamming Distance: 0.7354, Hamming Loss: 0.2451

The optimal K is: 4
Iteration 4 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 5 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 6 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 7 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 8 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 9 | Hamming Distance: 0.6673, Hamming Loss: 0.2224

The optimal K is: 4
Iteration 10 | Hamming Distance: 0.6673, Hamming Loss: 0.2224
