# Speaker Identification

In [1]:
import numpy as np 
from pathlib import Path 
from sklearn.neural_network import MLPClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import torch
import torchaudio 
from speechbrain.dataio.preprocess import AudioNormalizer
import librosa 

In [2]:
def load_audio(path):
    signal, sr = torchaudio.load(path, channels_first=False)
    return AudioNormalizer()(signal, sr)

CLIP_TIME = 10  # Use this length of audio. CLIP_TIME=10 means 10 seconds.
SAMPLE_RATE = 16000
base_dir = "../data/LibriSpeech"

def get_audio_signals(split_name="dev-clean", setting="one_utterance_per_speaker"):
    all_utterances = []
    for speaker in Path(base_dir, split_name).iterdir():
        speaker_collected = False 
        if not speaker.is_dir():
            continue 

        for session in speaker.iterdir():
            if speaker_collected:
                break 
            if not session.is_dir():
                continue 

            for utt in session.iterdir():
                if not str(utt).endswith("flac"):
                    continue 

                signal, sr = torchaudio.load(str(utt), channels_first=False)
                assert sr == SAMPLE_RATE, f"Expected sample rate is {SAMPLE_RATE}. Got {sr}"
                audio_len = len(signal) / sr 
                if audio_len > CLIP_TIME:
                    s = signal[: CLIP_TIME * sr, 0]
                    all_utterances.append(s)
                    speaker_collected = True 
                    break

    return torch.stack(all_utterances)

dev_speeches = get_audio_signals("dev-clean")  # (N_speaker, CLIP_TIME * SAMPLE_RATE)

In [3]:
def preprocess_data(speeches):
    mfccs = []
    for sig in speeches:
        y = np.array(sig)
        mfcc = librosa.feature.mfcc(y=y, sr=SAMPLE_RATE, n_mfcc=20)  # (n_mfcc, n_frame)
        mfccs.append(mfcc)
    return np.array(mfccs) 

dev_mfccs = preprocess_data(dev_speeches)  # (N_speaker, n_mfcc, n_frame)

### How long is the total audio?  
There are 250+40+40 speakers (train/dev/test folder), totaling 330 speakers. If we take 10s for each speaker, then total audio is 3300s -- might take up to 1 hour in transcription. For coding and debugging, just use dev set (400s) for now.  
Also: Use the 0-5 seconds as train; 6-10 as validation.

### How to do speaker identification?  
- Attempt 1: Concatenate all MFCC of a speaker. Run LogReg. Accuracy = 0.20  
- Attempt 2: Use one frame of MFCC. Acc = 0.51 (LogReg), 0.54 (MLPClassifier)  
- Attempt 3: Concatenate some frames of MFCC, and use the rest as data samples. The highest acc is 0.54; not significantly better than using one frame of MFCC.  
- Attempt 4: Concatenate some frames of MFCC. Use LSTM. Not good; resulted in acc of around 0.22. Probably smaller batches can be more useful.  

I'm going to proceed with using just one frame of MFCC henceforth.

In [4]:
def speaker_identification_experiment1(train_speeches, val_speeches):
    N_speaker = len(train_speeches)
    labels = [i for i in range(N_speaker)]
    train_feats = preprocess_data(train_speeches).reshape(N_speaker, -1)  
    valid_feats = preprocess_data(val_speeches).reshape(N_speaker, -1)

    model = LogisticRegression()
    model.fit(train_feats, labels)
    pred = model.predict(valid_feats)
    print(accuracy_score(labels, pred))

speaker_identification_experiment1(
    dev_speeches[:, :5*SAMPLE_RATE],
    dev_speeches[:, 5*SAMPLE_RATE:])  # 1.5s

0.2


In [5]:
def speaker_identification_experiment2(train_speeches, val_speeches):
    
    train_feats = preprocess_data(train_speeches)  # (n_speaker, n_mfcc, n_frame)
    n_speaker = train_feats.shape[0]
    n_mfcc = train_feats.shape[1]
    n_frame = train_feats.shape[2]
    train_X = np.swapaxes(train_feats, 1, 2).reshape(-1, n_mfcc)

    valid_feats = preprocess_data(val_speeches)
    valid_X = np.swapaxes(valid_feats, 1, 2).reshape(-1, n_mfcc)

    labels = [speaker for speaker in range(n_speaker) for fr in range(n_frame) ]

    model = MLPClassifier()
    model.fit(train_X, labels)
    pred = model.predict(valid_X)
    print(accuracy_score(labels, pred))

speaker_identification_experiment2(
    dev_speeches[:, :5*SAMPLE_RATE],
    dev_speeches[:, 5*SAMPLE_RATE:])

0.5415605095541401




In [6]:
def speaker_identification_experiment3(train_speeches, val_speeches):
    
    train_feats = preprocess_data(train_speeches)  # (n_speaker, n_mfcc, n_frame)
    n_speaker = train_feats.shape[0]
    n_mfcc = train_feats.shape[1]
    n_frame = train_feats.shape[2]
    n_per_sample = 3
    n_sample_per_speaker = 50
    assert n_per_sample * n_sample_per_speaker < n_frame, f"n_frame={n_frame} is too small."
    train_X = np.swapaxes(train_feats, 1, 2)  # (n_speaker, n_frame, n_mfcc)
    train_X = train_X[:, :n_per_sample * n_sample_per_speaker, :]  # discard the rest
    train_X = train_X.reshape(-1, n_per_sample * n_mfcc)  # (n_speaker * n_frame / n_per_sample, n_per_sample*n_mfcc)
    labels = [speaker for speaker in range(n_speaker) for rep in range(n_sample_per_speaker)]
    
    valid_feats = preprocess_data(val_speeches)
    valid_X = np.swapaxes(valid_feats, 1, 2)
    valid_X = valid_X[:, :n_per_sample * n_sample_per_speaker, :]
    valid_X = valid_X.reshape(-1, n_per_sample * n_mfcc)

    model = MLPClassifier([100])
    model.fit(train_X, labels)
    pred = model.predict(valid_X)
    print(accuracy_score(labels, pred))

speaker_identification_experiment3(
    dev_speeches[:, :5*SAMPLE_RATE],
    dev_speeches[:, 5*SAMPLE_RATE:])  # 4.4s

0.5455


In [7]:
import itertools  

class LSTMClassifier(torch.nn.Module):
    def __init__(self, n_mfcc, n_class):
        super().__init__()
        H = 160
        self.model = torch.nn.LSTM(n_mfcc, hidden_size=H, num_layers=3, batch_first=True)
        self.fc = torch.nn.Linear(H, n_class)

    def forward(self, X):
        #h0 = torch.randn(self.model.hidden_size * self.model.num_layers, len(labels), self.model.input_size)
        #c0 = torch.randn(self.model.hidden_size * self.model.num_layers, len(labels), self.model.input_size)
        
        output, (hn, cn) = self.model(torch.from_numpy(X))
        # output: (N, L, H_out)

        logits = self.fc(output[:, -1, :])
        return logits 

    def fit(self, train_X, labels):
        optimizer = torch.optim.Adam(
            itertools.chain(self.model.parameters(), self.fc.parameters()),
            lr=3e-4
        )
        self.train()
        min_loss = np.inf
        for epoch in range(200):
            logits = self.forward(train_X)
            loss = torch.nn.CrossEntropyLoss()(logits, torch.tensor(labels))

            loss.backward()
            optimizer.step()  # Large batch... all samples into one batch

            if loss < min_loss:
                min_loss = loss.item()
            else:
                print (f"Early stop at epoch {epoch}")
                break 

    def predict(self, test_X):
        with torch.no_grad():
            logits = self.forward(test_X)  # (N, D*H_out)
            probs, predictions = logits.max(dim=-1)  # (N)
        return predictions


def speaker_identification_experiment4(train_speeches, val_speeches):
    
    train_feats = preprocess_data(train_speeches)  # (n_speaker, n_mfcc, n_frame)
    n_speaker = train_feats.shape[0]
    n_mfcc = train_feats.shape[1]
    n_frame = train_feats.shape[2]
    n_per_sample = 3
    n_sample_per_speaker = 50
    assert n_per_sample * n_sample_per_speaker < n_frame, f"n_frame={n_frame} is too small."
    train_X = np.swapaxes(train_feats, 1, 2)  # (n_speaker, n_frame, n_mfcc)
    train_X = train_X[:, :n_per_sample * n_sample_per_speaker, :]  # discard the rest
    train_X = train_X.reshape(-1, n_per_sample, n_mfcc)  # (n_speaker, n_frame / n_per_sample, n_per_sample*n_mfcc)
    labels = [speaker for speaker in range(n_speaker) for rep in range(n_sample_per_speaker)]
    
    valid_feats = preprocess_data(val_speeches)
    valid_X = np.swapaxes(valid_feats, 1, 2)
    valid_X = valid_X[:, :n_per_sample * n_sample_per_speaker, :]
    valid_X = valid_X.reshape(-1, n_per_sample, n_mfcc)

    model = LSTMClassifier(n_mfcc, n_speaker)
    
    model.fit(train_X, labels)
    pred = model.predict(valid_X)
    print(accuracy_score(labels, pred))

speaker_identification_experiment4(
    dev_speeches[:, :5*SAMPLE_RATE],
    dev_speeches[:, 5*SAMPLE_RATE:])  # 9.3s

Early stop at epoch 73
0.192


In [8]:
train_speeches = get_audio_signals("train-clean-100")  # (N_speaker, CLIP_TIME * SAMPLE_RATE)
speaker_identification_experiment2(
    train_speeches[:, :5*SAMPLE_RATE],
    train_speeches[:, 5*SAMPLE_RATE:])  # 2 mins, 45 seconds

0.30794021366762253


