In [1]:
import pickle
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Name of the extracted-feature set; it selects the pickle file read below.
features = "mfcc_13_no_pitch_1000_rand_all_speakers"

# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only point this at feature files produced by a trusted pipeline.
with open(f"../../../data/extracted_features_v2/{features}.pickle", "rb") as file:
    mfcc_stats_dict = pickle.load(file)

# Number of readers in the loaded mapping, followed by one line per reader.
print(len(mfcc_stats_dict))

for reader, reader_samples in mfcc_stats_dict.items():
    print(f"reader: {reader} | # samples {len(reader_samples)}")

245
reader: 103 | # samples 1000
reader: 1034 | # samples 1000
reader: 1069 | # samples 1000
reader: 1081 | # samples 1000
reader: 1088 | # samples 1000
reader: 1098 | # samples 1000
reader: 1116 | # samples 1000
reader: 118 | # samples 1000
reader: 1235 | # samples 1000
reader: 1246 | # samples 1000
reader: 125 | # samples 1000
reader: 1263 | # samples 1000
reader: 1334 | # samples 1000
reader: 1355 | # samples 1000
reader: 1363 | # samples 1000
reader: 1447 | # samples 1000
reader: 1455 | # samples 1000
reader: 150 | # samples 1000
reader: 1502 | # samples 1000
reader: 1553 | # samples 1000
reader: 1578 | # samples 1000
reader: 1594 | # samples 1000
reader: 1624 | # samples 1000
reader: 163 | # samples 1000
reader: 1723 | # samples 1000
reader: 1737 | # samples 1000
reader: 1743 | # samples 1000
reader: 1841 | # samples 1000
reader: 1867 | # samples 1000
reader: 1898 | # samples 1000
reader: 19 | # samples 1000
reader: 1926 | # samples 1000
reader: 196 | # samples 1000
reader: 1963 |

In [3]:
def scale_features(data_dict):
    """
    Standardize every subject's samples with a single, shared StandardScaler.

    All subjects are stacked row-wise, the scaler is fitted on the stacked
    array, and the scaled rows are then cut back into per-subject pieces in
    the original key order.

    Parameters:
        data_dict (dict): Dictionary where each key is a subject, and the value is a 2D array
                          (samples x features).

    Returns:
        dict: A new dictionary with the same keys, each mapped to that subject's scaled rows.
    """
    subject_keys = list(data_dict.keys())
    subject_sizes = [len(data_dict[key]) for key in subject_keys]

    # One tall array, shape (total_samples, num_features), so the scaler
    # statistics are computed over all subjects at once.
    stacked = np.vstack([data_dict[key] for key in subject_keys])
    standardized = StandardScaler().fit_transform(stacked)

    # Row offsets where one subject ends and the next begins; the final
    # cumulative total is dropped because np.split only needs interior cuts.
    boundaries = np.cumsum(subject_sizes)[:-1]
    per_subject = np.split(standardized, boundaries)

    return dict(zip(subject_keys, per_subject))

def scale_and_optimize_pca(data_dict, variance_threshold=0.95):
    """
    Scales the data in a dictionary using StandardScaler, determines the smallest
    number of PCA components whose cumulative explained variance reaches the
    threshold, applies PCA, and returns the transformed data in the same
    dictionary structure.

    Parameters:
        data_dict (dict): Dictionary where each key is a subject, and the value is a 2D array
                          (samples x features).
        variance_threshold (float): The minimum cumulative explained variance, in [0, 1],
                                    used to determine the number of PCA components.

    Returns:
        tuple:
            dict: A new dictionary with PCA-transformed data, maintaining the same keys
                  and per-subject row counts as the input.
            int: The number of PCA components selected.

    Raises:
        ValueError: If ``data_dict`` is empty, or if ``variance_threshold`` is not in [0, 1].
    """
    if not data_dict:
        raise ValueError("data_dict must contain at least one subject")
    if not 0.0 <= variance_threshold <= 1.0:
        # Also rejects NaN, for which both comparisons are False.
        raise ValueError(
            f"variance_threshold must be between 0 and 1, got {variance_threshold!r}"
        )

    subject_keys = list(data_dict.keys())
    subject_sizes = [len(data_dict[key]) for key in subject_keys]

    # Combine all data into a single array, shape (total_samples, num_features)
    all_samples = np.vstack([data_dict[key] for key in subject_keys])

    # Scale the data
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(all_samples)

    # Fit PCA with all components to inspect the explained-variance curve
    pca = PCA()
    pca.fit(standardized_data)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # np.argmax on an all-False mask returns 0, which would silently select a
    # single component when the threshold is never reached (e.g. threshold 1.0
    # with a cumulative sum of 0.9999999 after float round-off). In that case
    # keep every component instead.
    reached = cumulative_variance >= variance_threshold
    if reached.any():
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = int(len(cumulative_variance))

    # Apply PCA with the selected number of components,
    # shape (total_samples, n_components)
    pca = PCA(n_components=n_components)
    pca_data = pca.fit_transform(standardized_data)

    # Split PCA-transformed data back into the original per-subject structure
    split_indices = np.cumsum(subject_sizes)[:-1]
    pca_subjects = np.split(pca_data, split_indices)
    pca_data_dict = dict(zip(subject_keys, pca_subjects))

    return pca_data_dict, n_components

In [4]:
# Scale, pick the component count for the default variance threshold, and project.
pca_transformed_data, optimal_components = scale_and_optimize_pca(mfcc_stats_dict)

# One line per key with the shape of its transformed array.
for key, transformed in pca_transformed_data.items():
    print(f"key: {key} shape: {transformed.shape}")

key: 103 shape: (1000, 46)
key: 1034 shape: (1000, 46)
key: 1069 shape: (1000, 46)
key: 1081 shape: (1000, 46)
key: 1088 shape: (1000, 46)
key: 1098 shape: (1000, 46)
key: 1116 shape: (1000, 46)
key: 118 shape: (1000, 46)
key: 1235 shape: (1000, 46)
key: 1246 shape: (1000, 46)
key: 125 shape: (1000, 46)
key: 1263 shape: (1000, 46)
key: 1334 shape: (1000, 46)
key: 1355 shape: (1000, 46)
key: 1363 shape: (1000, 46)
key: 1447 shape: (1000, 46)
key: 1455 shape: (1000, 46)
key: 150 shape: (1000, 46)
key: 1502 shape: (1000, 46)
key: 1553 shape: (1000, 46)
key: 1578 shape: (1000, 46)
key: 1594 shape: (1000, 46)
key: 1624 shape: (1000, 46)
key: 163 shape: (1000, 46)
key: 1723 shape: (1000, 46)
key: 1737 shape: (1000, 46)
key: 1743 shape: (1000, 46)
key: 1841 shape: (1000, 46)
key: 1867 shape: (1000, 46)
key: 1898 shape: (1000, 46)
key: 19 shape: (1000, 46)
key: 1926 shape: (1000, 46)
key: 196 shape: (1000, 46)
key: 1963 shape: (1000, 46)
key: 1970 shape: (1000, 46)
key: 198 shape: (1000, 46)
k

In [5]:
# Write the PCA-transformed dictionary next to the input features, prefixed with "pca_".
pca_output_path = f"../../../data/extracted_features_v2/pca_{features}.pickle"
with open(pca_output_path, "wb") as file:
    pickle.dump(pca_transformed_data, file)