In [3]:

import pickle
from collections import Counter
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [4]:
# Load the per-reader MFCC statistics used by the experiments below.
# Exactly one of the three feature files is active; the two commented-out
# alternatives load the "v2" and "v2_unclean" variants into the same variable.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point this at feature files produced by this project.

# with open("../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_v2.pickle", "rb") as file:
#     mfcc_stats_dict = pickle.load(file)

# with open("../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_v2_unclean.pickle", "rb") as file:
#     mfcc_stats_dict = pickle.load(file)

with open("../../data/extracted_features/mfcc_stats_that/mfcc_stats_that_before_after.pickle", "rb") as file:
    mfcc_stats_dict = pickle.load(file)

# Sanity check: report how many samples each reader key holds.
for reader in mfcc_stats_dict.keys():
    print(f"reader: {reader} | # samples {len(mfcc_stats_dict[reader])}")
    # Optional per-sample dump; presumably each entry is an (mfcc, file_id) pair
    # where mfcc is an array -- TODO confirm against the feature-extraction code.
    # for mfcc, file_id in mfcc_stats_dict[reader]:
    #     print(f"\t 1st 2 mfcc: {mfcc} | # features: {mfcc.shape[0]} | file ID: {file_id}")

reader: 201 | # samples 61
reader: 311 | # samples 79
reader: 87 | # samples 72


In [6]:
def get_lowest_data(mfcc_dict):
    """Return the smallest sample count found among the readers of `mfcc_dict`.

    mfcc_dict: mapping from a reader key to a sized collection of samples.

    Raises ValueError when `mfcc_dict` has no readers, because an empty
    collection has no minimum.
    """
    sample_counts = [len(mfcc_dict[reader]) for reader in mfcc_dict.keys()]
    return min(sample_counts)

def partition_data(reader, max):
    """Return the first element of each pair in `reader`, truncated to `max` items.

    reader: iterable of 2-item entries; only the first item of each entry is
        kept (the second is ignored -- presumably a file ID, verify with caller).
    max: upper bound on the number of returned items (used as a slice end).
    """
    features = []
    for mfcc, _ in reader:
        features.append(mfcc)

    return features[:max]

def separate_labels(labels, speaker_keys):
    """Cut `labels` into consecutive chunks, one per entry of `speaker_keys`.

    labels: sliceable sequence of labels.
    speaker_keys: iterable of chunk lengths, applied left to right.

    Returns a list with one slice of `labels` per chunk length.
    """
    chunks = []
    start = 0
    for size in speaker_keys:
        stop = start + size
        chunks.append(labels[start:stop])
        start = stop
    return chunks

def calculate_percentage(data):
    """Compute, for every sublist in `data`, the percentage share of each value.

    data: iterable of sized, iterable sublists holding hashable values.

    Returns a list with one dict per sublist mapping value -> percentage
    (0-100) of that sublist's entries equal to the value. An empty sublist
    produces an empty dict.
    """
    def _shares(values):
        size = len(values)
        if size == 0:
            return {}

        # Tally occurrences in first-seen order.
        tally = {}
        for value in values:
            tally[value] = tally.get(value, 0) + 1

        return {value: (hits / size) * 100 for value, hits in tally.items()}

    return [_shares(sublist) for sublist in data]

def format_percentages(percentages):
    """Print one summary per percentage dict, largest share first.

    percentages: iterable of dicts mapping value -> percentage, in the form
        produced by calculate_percentage.

    Non-empty dicts are printed as "Speaker N:" followed by the entries sorted
    by descending percentage; empty dicts are printed as "Sublist N: No data".
    N is the 1-based position in `percentages`. Returns None.
    """
    for position, shares in enumerate(percentages, start=1):
        if shares:
            # Highest percentage first; ties keep their original order.
            ranked = sorted(shares.items(), key=lambda pair: pair[1], reverse=True)
            entries = [f"{num}: {percent:.2f}%" for num, percent in ranked]
            print(f"Speaker {position}:\n\t" + ", ".join(entries))
        else:
            print(f"Sublist {position}: No data")

def create_truth_list(samples, label_order):
    """
    Build a ground-truth label list made of two equally sized halves.

    samples:
        total number of samples; each half holds samples // 2 labels, so an
        odd value yields a list that is one element shorter than `samples`
    label_order: 
        0 for 1st half 0 and 2nd half 1
        1 for 1st half 1 and 2nd half 0

    Returns a list of 0/1 labels of length 2 * (samples // 2).
    Raises ValueError if label_order is neither 0 nor 1.
    """
    # Fail loudly on a bad order instead of printing a hint and then crashing
    # on an undefined local variable.
    if label_order == 0:
        first_label, second_label = 0, 1
    elif label_order == 1:
        first_label, second_label = 1, 0
    else:
        raise ValueError(f"label_order must be 0 or 1, got {label_order!r}")

    half = samples // 2
    return [first_label] * half + [second_label] * half

def create_binary_labels(length):
    """Return `length` zeros followed by `length` ones (2 * length labels in total)."""
    zeros = [0] * length
    ones = [1] * length
    return zeros + ones


In [None]:
# Shared reporting helper for the two speaker-pair experiments in this cell.
def _report_binary_metrics(label_true, label_pred):
    """Print accuracy, confusion-matrix counts and F1 for a 0/1 task, and return them.

    label_true: ground-truth 0/1 labels of the test split.
    label_pred: predicted 0/1 labels for the same samples.

    Returns the tuple (accuracy, tn, fp, fn, tp, f1).
    """
    accuracy = accuracy_score(label_true, label_pred)
    print("Accuracy:", accuracy)

    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpack also works
    # when the test split (and the predictions) contain only one class.
    tn, fp, fn, tp = confusion_matrix(label_true, label_pred, labels=[0, 1]).ravel()
    f1 = f1_score(label_true, label_pred)

    print("True Positives (TP):", tp)
    print("True Negatives (TN):", tn)
    print("False Positives (FP):", fp)
    print("False Negatives (FN):", fn)

    print(f"F1-Score: {f1}")

    return accuracy, tn, fp, fn, tp, f1


####################
# different gender #
# 87 woman 201 man #
####################

# Both speakers are truncated to the smaller of their two sample counts so the
# classes are balanced. The count is derived from the loaded feature file
# instead of being edited by hand for every file; previously recorded values:
#   61 (201 has 61, 87 has 72; cleaned before/after)
#   79 (201 has 79, 87 has 100; cleaned)
#   76 (201 has 76, 87 has 116; unclean)
max_number_of_sample_87_201 = get_lowest_data(
    {reader_id: mfcc_stats_dict[reader_id] for reader_id in ("87", "201")}
)

print(f"max number of samples 87 & 201: {max_number_of_sample_87_201}")

mfcc_87 = partition_data(mfcc_stats_dict["87"], max_number_of_sample_87_201)
mfcc_201 = partition_data(mfcc_stats_dict["201"], max_number_of_sample_87_201)

# Rows of reader 87 come first (label 0), rows of reader 201 second (label 1).
data_87_201 = np.vstack([mfcc_87, mfcc_201])

binary_label_87_201 = create_binary_labels(max_number_of_sample_87_201)
assert len(data_87_201) == len(binary_label_87_201), "features and labels are misaligned"

# Split the data
data_train_87_201, data_test_87_201, label_train_87_201, label_test_87_201 = train_test_split(data_87_201, binary_label_87_201, test_size=0.2, random_state=42)

# Initialize and train the model
svm_model_87_201 = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_model_87_201.fit(data_train_87_201, label_train_87_201)

# Make predictions and evaluate
test_pred_87_201 = svm_model_87_201.predict(data_test_87_201)
(
    accuracy_87_201,
    tn_87_201,
    fp_87_201,
    fn_87_201,
    tp_87_201,
    f1_87_201,
) = _report_binary_metrics(label_test_87_201, test_pred_87_201)

###################
#   same gender   #
# 201 man 311 man #
###################

# Same balancing rule as above; previously recorded values:
#   61 (201 has 61, 311 has 79; cleaned before/after)
#   79 (201 has 79, 311 has 105; cleaned)
#   76 (201 has 76, 311 has 98; unclean)
max_number_of_sample_201_311 = get_lowest_data(
    {reader_id: mfcc_stats_dict[reader_id] for reader_id in ("201", "311")}
)

print(f"\nmax number of samples 201 & 311 : {max_number_of_sample_201_311}")

mfcc_201 = partition_data(mfcc_stats_dict["201"], max_number_of_sample_201_311)
mfcc_311 = partition_data(mfcc_stats_dict["311"], max_number_of_sample_201_311)

# Rows of reader 201 come first (label 0), rows of reader 311 second (label 1).
data_201_311 = np.vstack([mfcc_201, mfcc_311])

binary_label_201_311 = create_binary_labels(max_number_of_sample_201_311)
assert len(data_201_311) == len(binary_label_201_311), "features and labels are misaligned"

# Split the data
data_train_201_311, data_test_201_311, label_train_201_311, label_test_201_311 = train_test_split(data_201_311, binary_label_201_311, test_size=0.2, random_state=42)

# Initialize and train the model
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_model.fit(data_train_201_311, label_train_201_311)
# Alias that follows the naming of the first experiment; `svm_model` is kept
# because other cells may still refer to it.
svm_model_201_311 = svm_model

# Make predictions and evaluate
test_pred_201_311 = svm_model.predict(data_test_201_311)
(
    accuracy_201_311,
    tn_201_311,
    fp_201_311,
    fn_201_311,
    tp_201_311,
    f1_201_311,
) = _report_binary_metrics(label_test_201_311, test_pred_201_311)

max number of samples 87 & 201: 72
Accuracy: 1.0
True Positives (TP): 13
True Negatives (TN): 16
False Positives (FP): 0
False Negatives (FN): 0
F1-Score: 1.0

max number of samples 201 & 311 : 61
Accuracy: 0.6
True Positives (TP): 9
True Negatives (TN): 6
False Positives (FP): 9
False Negatives (FN): 1
F1-Score: 0.6428571428571429


## Cleaned
#### 87 and 201
* max number of samples 87 & 201: 79
* Accuracy: 1.0
* True Positives (TP): 17
* True Negatives (TN): 15
* False Positives (FP): 0
* False Negatives (FN): 0
* F1-Score: 1.0

#### 201 and 311
* max number of samples 201 & 311 : 79
* Accuracy: 0.78125
* True Positives (TP): 14
* True Negatives (TN): 11
* False Positives (FP): 4
* False Negatives (FN): 3
* F1-Score: 0.8

## Uncleaned
#### 87 and 201
* max number of samples 87 & 201: 76
* Accuracy: 0.9354838709677419
* True Positives (TP): 15
* True Negatives (TN): 14
* False Positives (FP): 2
* False Negatives (FN): 0
* F1-Score: 0.9375

#### 201 and 311
* max number of samples 201 & 311 : 76
* Accuracy: 0.8064516129032258
* True Positives (TP): 13
* True Negatives (TN): 12
* False Positives (FP): 4
* False Negatives (FN): 2
* F1-Score: 0.8125