In [102]:
import os
import librosa
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import joblib
import csv

In [96]:

# Function to extract features from a single audio file
def extract_features(audio_signal, sr=22050, n_mfcc=13):
    #print("Extracting features...")
    mfccs = librosa.feature.mfcc(y=audio_signal, sr=sr, n_mfcc=n_mfcc)
    #print(f"Extracted MFCCs shape: {mfccs.shape}")
    return mfccs.T

# Process all audio files in a directory to extract features
def process_audio_files(directory):
    all_features = []
    print(f"Extracting mfcc of all audio in {directory}")
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            #print(f"Processing file: {file_path}")
            audio_signal, sr = librosa.load(file_path, sr=22050)
            #print(f"Loaded audio file with sample rate: {sr}")
            features = extract_features(audio_signal, sr)
            all_features.append(features)
    all_features = np.vstack(all_features)
    print(f"Extracted mfcc of all audio {directory}")
    print(f"Combined features shape: {all_features.shape}")
    return all_features

# Train a Universal Background Model (UBM)
def train_ubm(features, n_components=512):
    print("Training UBM...")
    gmm = GaussianMixture(n_components=n_components, covariance_type='diag')
    gmm.fit(features)
    print("UBM trained.")
    return gmm

# Train a total variability matrix (T-matrix)

def train_t_matrix(features, supervector_size, n_factors=400):
    print(f"Training T-matrix with {supervector_size} dimensions and {n_factors} factors...")
    pca = PCA(n_components=n_factors)
    pca.fit(features)
    t_matrix = np.random.rand(supervector_size, n_factors)  # Adjust to match supervector size
    print(f"T-matrix shape: {t_matrix.shape}")
    print("T-matrix trained.")
    return t_matrix


# Function to extract i-vectors using UBM and T-matrix
def extract_i_vectors(features, ubm, t_matrix):
    print("Extracting i-vector...")
    # Calculate posterior probabilities for each Gaussian component
    posteriors = ubm.predict_proba(features)
    
    # Calculate supervector by taking the weighted mean of the features
    supervector = np.dot(posteriors.T, features)
    
    # Flatten the supervector
    supervector = supervector.flatten()
    
    # Ensure that the T-matrix dimensions match the supervector size
    if t_matrix.shape[0] != supervector.size:
        raise ValueError(f"T-matrix and supervector size mismatch: T-matrix has {t_matrix.shape[0]} rows, supervector has {supervector.size} elements.")
    
    # Project the supervector onto the lower-dimensional i-vector space using the T-matrix
    i_vector = np.dot(supervector, t_matrix)
    
    # Normalize the i-vector (optional but common in i-vector extraction)
    i_vector /= np.linalg.norm(i_vector)
    
    print(f"i-vector shape: {i_vector.shape}")
    return i_vector



In [31]:
# Directory containing the large universal dataset for training
universal_dataset_directory = '/kaggle/input/mic-dev-eval/mic'

# Process the universal dataset and extract features
print("Processing universal dataset...")
universal_features = process_audio_files(universal_dataset_directory)

Processing universal dataset...
Extracting mfcc of all audio in /kaggle/input/mic-dev-eval/mic
Extracted mfcc of all audio /kaggle/input/mic-dev-eval/mic
Combined features shape: (584029, 13)


In [99]:
ubm_model_path = 'ubm_model.joblib'
t_matrix_path = 't_matrix.joblib'

# Train UBM and T-matrix on the universal dataset
if os.path.exists(ubm_model_path):
    print("Loading UBM model from file...")
    ubm = joblib.load(ubm_model_path)
else:
    ubm = train_ubm(universal_features)
    joblib.dump(ubm, ubm_model_path)
    print(f"UBM model saved to {ubm_model_path}")

# Load or train T-matrix
if os.path.exists(t_matrix_path):
    print("Loading T-matrix from file...")
    t_matrix = joblib.load(t_matrix_path)
else:
    t_matrix = train_t_matrix(universal_features, supervector_size=6656, n_factors=13)
    joblib.dump(t_matrix, t_matrix_path)
    print(f"T-matrix saved to {t_matrix_path}")


Loading UBM model from file...
Training T-matrix with 6656 dimensions and 13 factors...
T-matrix shape: (6656, 13)
T-matrix trained.
T-matrix saved to t_matrix.joblib


In [101]:


individual_files_directory = '/kaggle/input/tut-2016/TUT_2016/TUT_Acoustic_scenes_development_all_in_one'
# File to save the i-vectors
csv_file = "i_vectors.csv"

# Open the CSV file in write mode
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(['Filename'] + [f'i_vector_{i}' for i in range(t_matrix.shape[1])])

    # Process each individual audio file and extract its i-vector
    for filename in os.listdir(individual_files_directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(individual_files_directory, filename)
            print(f"\nProcessing individual file: {file_path}")
            audio_signal, sr = librosa.load(file_path, sr=22050)
            print(f"Loaded audio file with sample rate: {sr}")
            features = extract_features(audio_signal, sr)
            
            # Extract i-vector for the current file using the pre-trained UBM and T-matrix
            i_vector = extract_i_vectors(features, ubm, t_matrix)
            
            # Save the filename and i-vector to the CSV
            writer.writerow([filename] + i_vector.tolist())

        print(f'i-vector for {filename} saved to CSV.')



Processing individual file: /kaggle/input/tut-2016/TUT_2016/TUT_Acoustic_scenes_development_all_in_one/a101_180_210_class2.wav
Loaded audio file with sample rate: 22050
Extracting i-vector...
i-vector shape: (13,)
i-vector for a101_180_210_class2.wav saved to CSV.

Processing individual file: /kaggle/input/tut-2016/TUT_2016/TUT_Acoustic_scenes_development_all_in_one/a079_120_150_class13.wav
Loaded audio file with sample rate: 22050
Extracting i-vector...
i-vector shape: (13,)
i-vector for a079_120_150_class13.wav saved to CSV.

Processing individual file: /kaggle/input/tut-2016/TUT_2016/TUT_Acoustic_scenes_development_all_in_one/a107_210_240_class1.wav
Loaded audio file with sample rate: 22050
Extracting i-vector...
i-vector shape: (13,)
i-vector for a107_210_240_class1.wav saved to CSV.

Processing individual file: /kaggle/input/tut-2016/TUT_2016/TUT_Acoustic_scenes_development_all_in_one/a059_30_60_class4.wav
Loaded audio file with sample rate: 22050
Extracting i-vector...
i-vector 