In [2]:
import pickle
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [3]:
# Alternative feature files (the previously commented-out loads); point
# `mfcc_stats_path` at one of them to switch data sets:
#   "../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_v2.pickle"
#   "../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_v2_unclean.pickle"
mfcc_stats_path = "../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_before_after.pickle"

# NOTE: pickle.load can execute arbitrary code while loading, so only point
# this at files you trust.
with open(mfcc_stats_path, "rb") as file:
    mfcc_stats_dict = pickle.load(file)

# One summary line per reader: its key and how many entries it holds.
for reader in mfcc_stats_dict:
    print(f"reader: {reader} | # samples {len(mfcc_stats_dict[reader])}")
    # Uncomment to inspect every entry of the reader:
    # for mfcc, file_id in mfcc_stats_dict[reader]:
    #     print(f"\t 1st 2 mfcc: {mfcc} | # features: {mfcc.shape[0]} | file ID: {file_id}")

reader: 201 | # samples 61
reader: 311 | # samples 79
reader: 87 | # samples 72


In [4]:
def get_lowest_data(mfcc_dict):
    """Return the smallest number of samples held by any reader.

    mfcc_dict: mapping of reader key -> sized collection of samples.

    Raises ValueError (from ``min``) when ``mfcc_dict`` is empty.
    """
    return min([len(samples) for samples in mfcc_dict.values()])

def partition_data(reader, max, partition_percent=.80):
    """Split one reader's features into a train list and a test list.

    reader: iterable of 2-item entries; only the first item of each entry
        is kept, the second is discarded.
    max: upper bound on how many entries are used (applied as a slice stop).
    partition_percent: fraction of the kept entries that goes to the train
        list; the split index is ``round(kept * partition_percent)``.

    Returns a ``(train, test)`` tuple of lists; order is preserved and the
    test list is whatever follows the split index.
    """
    features = [first for first, _ in reader][:max]
    split_at = round(len(features) * partition_percent)
    return features[:split_at], features[split_at:]

def separate_labels(labels, speaker_keys):
    """Cut a flat label sequence into consecutive per-speaker chunks.

    labels: sliceable sequence of labels for all speakers, back to back.
    speaker_keys: number of labels belonging to each speaker, in the same
        order in which the speakers appear in ``labels``.

    Returns a list with one slice of ``labels`` per entry in ``speaker_keys``.
    """
    chunks = []
    start = 0
    for count in speaker_keys:
        stop = start + count
        chunks.append(labels[start:stop])
        start = stop
    return chunks

def calculate_percentage(data):
    """Compute, for every sublist, the percentage share of each distinct value.

    data: iterable of sized, iterable sublists (e.g. cluster labels per speaker).

    Returns a list with one dict per sublist mapping value -> percentage
    (0-100) of that sublist. An empty sublist produces an empty dict. Keys
    appear in the order the values are first seen in the sublist.
    """
    def _shares(group):
        size = len(group)
        if size == 0:
            return {}
        # Counter keeps first-seen order, so the resulting dict does too.
        return {value: (hits / size) * 100 for value, hits in Counter(group).items()}

    return [_shares(group) for group in data]

def format_percentages(percentages):
    """Print the percentage breakdown produced by ``calculate_percentage``.

    percentages: sequence of dicts mapping value -> percentage.

    For each non-empty dict a "Speaker N:" header is printed followed by its
    entries from highest to lowest percentage, formatted with two decimals.
    An empty dict prints "Sublist N: No data". Numbering starts at 1.
    Nothing is returned.
    """
    for position, shares in enumerate(percentages, start=1):
        if not shares:
            print(f"Sublist {position}: No data")
            continue

        # Largest share first.
        ranked = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)
        entries = [f"{value}: {share:.2f}%" for value, share in ranked]
        print(f"Speaker {position}:\n\t" + ", ".join(entries))

def create_truth_list(samples, label_order):
    """
    Build a binary ground-truth list made of two equal-length halves.

    samples:
        total number of predictions to label. Each half receives
        ``samples // 2`` entries, so an odd ``samples`` yields a list of
        ``samples - 1`` labels.
    label_order: 
        0 for 1st half 0 and 2nd half 1
        1 for 1st half 1 and 2nd half 0

    Returns the list of labels (first half followed by second half).

    Raises ValueError when ``label_order`` is neither 0 nor 1. Previously this
    case printed a hint and then failed with an UnboundLocalError because the
    halves were never created.
    """
    if label_order == 0:
        first_label, second_label = 0, 1
    elif label_order == 1:
        first_label, second_label = 1, 0
    else:
        raise ValueError(f"label_order must be 0 or 1, got {label_order!r}")

    half = samples // 2
    return [first_label] * half + [second_label] * half


In [5]:
# mfccs_train_1 = [mfcc for mfcc, _ in mfcc_stats_dict["19"]]

# mfccs_train_1 = [mfcc for mfcc, _ in mfcc_stats_dict["26"]]
# Keep only the first item (the feature vector) of every entry for each reader.
mfccs_train_2 = [features for features, _ in mfcc_stats_dict["201"]]
mfccs_train_3 = [features for features, _ in mfcc_stats_dict["311"]]

# One matrix for both readers: reader 201's rows first, then reader 311's.
# (Three-reader variant: np.vstack([mfccs_train_1, mfccs_train_2, mfccs_train_3]))
stacked = np.vstack([mfccs_train_2, mfccs_train_3])
print(stacked.shape)

# Rows contributed by each reader, in stacking order, so the flat cluster
# labels can be split back per reader afterwards.
training_label_key = [len(mfccs_train_2), len(mfccs_train_3)]

# fit() returns the fitted estimator, so construction and fitting can be chained.
kmeans = KMeans(n_clusters=2, random_state=42).fit(stacked)
labels = kmeans.labels_

separated_labels_train = separate_labels(labels, training_label_key)
format_percentages(calculate_percentage(separated_labels_train))
# print(labels)

(140, 142)
Speaker 1:
	0: 54.10%, 1: 45.90%
Speaker 2:
	1: 92.41%, 0: 7.59%


In [6]:
####################
# different gender #
# 87 woman 201 man #
####################

# Both readers are truncated to the smaller of their two sample counts so the
# clustering is fit on the same number of samples per reader. The cap is read
# from the loaded data rather than hard-coded, so it stays correct when a
# different feature pickle is loaded (61 for the "before_after" file).
max_number_of_sample_87_201 = min(len(mfcc_stats_dict["87"]), len(mfcc_stats_dict["201"]))
print(f"max number of sampls 87 & 201: {max_number_of_sample_87_201}")

# 80 % of each reader's capped samples go to training, the rest to testing.
mfcc_train_87, mfcc_test_87 = partition_data(mfcc_stats_dict["87"], max_number_of_sample_87_201, partition_percent=.8)
mfcc_train_201, mfcc_test_201 = partition_data(mfcc_stats_dict["201"], max_number_of_sample_87_201, partition_percent=.8)

# Rows per reader in stacking order (87 first, then 201); used to split the
# flat label array back into one chunk per reader.
training_label_key_87_201 = [len(mfcc_train_87), len(mfcc_train_201)]

training_87_201 = np.vstack([mfcc_train_87, mfcc_train_201])

kmeans_87_201 = KMeans(n_clusters=2, random_state=42)
kmeans_87_201.fit(training_87_201)
labels_87_201 = kmeans_87_201.labels_

separated_labels_train_87_201 = separate_labels(labels_87_201, training_label_key_87_201)
format_percentages(calculate_percentage(separated_labels_train_87_201))

###################
#   same gender   #
# 201 man 311 man #
###################

# Same balancing rule for this pair: cap at the smaller reader's sample count.
max_number_of_sample_201_311 = min(len(mfcc_stats_dict["201"]), len(mfcc_stats_dict["311"]))
print(f"\nmax number of sampls 201 & 311 : {max_number_of_sample_201_311}")

# NOTE: this rebinds mfcc_train_201 / mfcc_test_201 from the section above.
# The two splits of reader 201 are identical only while both caps are equal
# (true whenever 201 is the smallest reader of both pairs); any later code
# that pairs mfcc_test_201 with kmeans_87_201 relies on that.
mfcc_train_201, mfcc_test_201 = partition_data(mfcc_stats_dict["201"], max_number_of_sample_201_311, partition_percent=.8)
mfcc_train_311, mfcc_test_311 = partition_data(mfcc_stats_dict["311"], max_number_of_sample_201_311, partition_percent=.8)

# Rows per reader in stacking order (201 first, then 311).
training_label_key_201_311 = [len(mfcc_train_201), len(mfcc_train_311)]

training_201_311 = np.vstack([mfcc_train_201, mfcc_train_311])

kmeans_201_311 = KMeans(n_clusters=2, random_state=42)
kmeans_201_311.fit(training_201_311)
labels_201_311 = kmeans_201_311.labels_

separated_labels_train_201_311 = separate_labels(labels_201_311, training_label_key_201_311)
format_percentages(calculate_percentage(separated_labels_train_201_311))

max number of sampls 87 & 201: 61
Speaker 1:
	0: 93.88%, 1: 6.12%
Speaker 2:
	1: 75.51%, 0: 24.49%

max number of sampls 201 & 311 : 61
Speaker 1:
	0: 53.06%, 1: 46.94%
Speaker 2:
	1: 93.88%, 0: 6.12%


In [7]:
# --- 87 vs 201: assign the held-out samples to the clusters fit on this pair ---
testing_87_201 = np.vstack([mfcc_test_87, mfcc_test_201])
# Rows per reader in stacking order, used to split the predictions per reader.
testing_label_key_87_201 = [len(mfcc_test_87), len(mfcc_test_201)]

labels_prediction_87_201 = kmeans_87_201.predict(testing_87_201)
separated_labels_test_87_201 = separate_labels(
    labels_prediction_87_201, testing_label_key_87_201
)

print("87 and 201")
format_percentages(calculate_percentage(separated_labels_test_87_201))


# --- 201 vs 311: same procedure with the second model ---
testing_201_311 = np.vstack([mfcc_test_201, mfcc_test_311])
testing_label_key_201_311 = [len(mfcc_test_201), len(mfcc_test_311)]

labels_prediction_201_311 = kmeans_201_311.predict(testing_201_311)
separated_labels_test_201_311 = separate_labels(
    labels_prediction_201_311, testing_label_key_201_311
)

print("\n201 and 311")
format_percentages(calculate_percentage(separated_labels_test_201_311))


87 and 201
Speaker 1:
	0: 100.00%
Speaker 2:
	1: 75.00%, 0: 25.00%

201 and 311
Speaker 1:
	0: 58.33%, 1: 41.67%
Speaker 2:
	1: 91.67%, 0: 8.33%


In [8]:
# --- 87 vs 201 ---
# label_order=0 labels the first half of the predictions 0 and the second
# half 1, i.e. the scores below assume cluster 0 lines up with the reader
# stacked first and cluster 1 with the reader stacked second.
ground_truth_87_201 = create_truth_list(len(labels_prediction_87_201), 0)

accuracy_87_201 = accuracy_score(ground_truth_87_201, labels_prediction_87_201)
f1_87_201 = f1_score(ground_truth_87_201, labels_prediction_87_201)
# For a binary problem the flattened confusion matrix reads tn, fp, fn, tp.
matrix_87_201 = confusion_matrix(ground_truth_87_201, labels_prediction_87_201)
tn_87_201, fp_87_201, fn_87_201, tp_87_201 = matrix_87_201.ravel()


print("87 and 201")
print("True Positives (TP):", tp_87_201)
print("True Negatives (TN):", tn_87_201)
print("False Positives (FP):", fp_87_201)
print("False Negatives (FN):", fn_87_201)

print(f"F1-Score: {f1_87_201}")
print(f"Accuracy: {accuracy_87_201}")

# --- 201 vs 311 (same assumption about cluster numbering) ---
ground_truth_201_311 = create_truth_list(len(labels_prediction_201_311), 0)

accuracy_201_311 = accuracy_score(ground_truth_201_311, labels_prediction_201_311)
f1_201_311 = f1_score(ground_truth_201_311, labels_prediction_201_311)
matrix_201_311 = confusion_matrix(ground_truth_201_311, labels_prediction_201_311)
tn_201_311, fp_201_311, fn_201_311, tp_201_311 = matrix_201_311.ravel()

print("\n201 and 311")
print("True Positives (TP):", tp_201_311)
print("True Negatives (TN):", tn_201_311)
print("False Positives (FP):", fp_201_311)
print("False Negatives (FN):", fn_201_311)

print(f"F1-Score: {f1_201_311}")
print(f"Accuracy: {accuracy_201_311}")

87 and 201
True Positives (TP): 9
True Negatives (TN): 12
False Positives (FP): 0
False Negatives (FN): 3
F1-Score: 0.8571428571428571
Accuracy: 0.875

201 and 311
True Positives (TP): 11
True Negatives (TN): 7
False Positives (FP): 5
False Negatives (FN): 1
F1-Score: 0.7857142857142857
Accuracy: 0.75


## Cleaned
#### 87 and 201
* True Positives (TP): 11
* True Negatives (TN): 16
* False Positives (FP): 0
* False Negatives (FN): 5
* F1-Score: 0.8148148148148148
* Accuracy: 0.84375

#### 201 and 311
* True Positives (TP): 14
* True Negatives (TN): 10
* False Positives (FP): 6
* False Negatives (FN): 2
* F1-Score: 0.7777777777777778
* Accuracy: 0.75

## Uncleaned
#### 87 and 201
* True Positives (TP): 13
* True Negatives (TN): 14
* False Positives (FP): 1
* False Negatives (FN): 2
* F1-Score: 0.896551724137931
* Accuracy: 0.9

#### 201 and 311
* True Positives (TP): 12
* True Negatives (TN): 11
* False Positives (FP): 4
* False Negatives (FN): 3
* F1-Score: 0.7741935483870968
* Accuracy: 0.7666666666666667


