In [None]:
# Mount Google Drive into the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Install the import_ipynb module (lets other notebooks be imported as modules)
!pip install import_ipynb

# Change into the code directory so the project notebooks can be imported
%cd /content/drive/MyDrive/team_malmungchi/colab/speaker_verification/code
!ls

In [None]:
import os
import sys
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import SGD
import numpy as np
import pickle
import IPython.display as ipd

#sys.path.append("/content/drive/MyDrive/team_malmungchi/colab/speaker_verification/code")
import import_ipynb
from constants import NUM_FBANK, HOP_LENGTH, NUM_SPEC_PER_SPEAKER, NUM_FRAME, NUM_ENROLL_UTT, NUM_TRUE_UTT, NUM_FALSE_UTT
import batcher
import network
import loss
import fitter
from utils import load_best_checkpoint, normalize, cosine_similarity
from test_eer import test_frame, test_utt
from eval_metrics import calculate_eer

# Run tf.functions eagerly: test_step uses numpy to compute the EER
tf.config.run_functions_eagerly(True)

importing Jupyter notebook from constants.ipynb
importing Jupyter notebook from batcher.ipynb
importing Jupyter notebook from network.ipynb
importing Jupyter notebook from eval_metrics.ipynb
importing Jupyter notebook from utils.ipynb
importing Jupyter notebook from loss.ipynb
importing Jupyter notebook from fitter.ipynb
importing Jupyter notebook from test_eer.ipynb


In [None]:
import librosa

def pcm2npy(file_path):
    """Load a headerless 16-bit little-endian PCM file as a float32 waveform.

    The samples are scaled by 1/32768, i.e. the same conversion that
    ``librosa.util.buf_to_float(..., n_bytes=2)`` performs. Doing it with
    numpy directly avoids the positional ``n_bytes`` argument, which newer
    librosa releases only accept as a keyword.

    Args:
        file_path: path of the raw PCM file.

    Returns:
        1-D ``np.float32`` array with values in [-1, 1).
    """
    # Reference: https://kaen2891.tistory.com/107
    with open(file_path, 'rb') as opened_pcm_file:
        buf = opened_pcm_file.read()
    # A truncated recording may end on half a sample; np.frombuffer would
    # reject a byte count that is not a multiple of the sample size.
    usable = len(buf) - (len(buf) % 2)
    pcm = np.frombuffer(buf[:usable], dtype='<i2')
    return pcm.astype(np.float32) * np.float32(1.0 / 32768)

def trim_silence(wav):
    """Cut leading and trailing low-energy samples from a waveform.

    A sample counts as "loud" when its absolute value is strictly above the
    95th percentile of the absolute values. The result spans from the first
    to the last loud sample, both included.

    Args:
        wav: 1-D array of audio samples.

    Returns:
        The trimmed slice of ``wav``. ``wav`` is returned unchanged when it is
        empty or when no sample exceeds the threshold (e.g. a constant signal),
        instead of raising IndexError.
    """
    energy = np.abs(wav)
    if energy.size == 0:
        return wav
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    if offsets.size == 0:
        return wav
    # Alternatives such as librosa's trim also exist, see
    # https://malaya-speech.readthedocs.io/en/latest/remove-silent-vad.html
    # The +1 keeps the last loud sample, which a bare offsets[-1] stop would drop.
    return wav[offsets[0]:offsets[-1] + 1]

def npy2spec(wav, sr=44100): # return: [time][freq][ch]
    """Turn a waveform into a per-frame normalized log-mel spectrogram.

    Args:
        wav: 1-D audio samples.
        sr: sampling rate of ``wav``. HOP_LENGTH is scaled by ``sr/16000``
            before being passed to librosa, so pass the real rate of ``wav``.

    Returns:
        Array indexed as [time][freq][channel] with a single channel.
    """
    # TODO: match the spectrogram parameters to the C++ library that will be
    # used on the Arduino.

    # Careful with sr: the hop length below depends on it.
    samples_per_hop = int((HOP_LENGTH/16000)*sr)

    # n_mels is left at librosa's default (128).
    mel_power = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=NUM_FBANK, hop_length=samples_per_hop)
    log_mel = librosa.power_to_db(mel_power)  # [freq][time]

    # A disabled variant stacked delta and delta-delta features (width=5) as
    # two extra channels; only the single log-mel channel is produced now.

    frames = normalize_frames(log_mel.T)  # [time][freq], each frame standardized
    return np.expand_dims(frames, axis=-1)  # append the channel axis

def normalize_frames(m, epsilon=1e-12):
    """Standardize every row of ``m`` to zero mean and unit standard deviation.

    Args:
        m: iterable of 1-D arrays (one per frame).
        epsilon: lower bound for the divisor, so constant frames do not
            divide by zero.

    Returns:
        A Python list with one normalized array per input row.
    """
    normalized = []
    for frame in m:
        spread = max(np.std(frame), epsilon)
        normalized.append((frame - np.mean(frame)) / spread)
    return normalized

# Convert a single utterance into a spectrogram
def convert(file_path, trim=False):
    """Load one audio file and convert it to a spectrogram via ``npy2spec``.

    Files whose name ends in ``pcm`` (any letter case) are read as raw 16 kHz
    PCM with ``pcm2npy``; everything else is decoded with ``librosa.load`` and
    resampled to 16 kHz.

    Args:
        file_path: path of the audio file (string).
        trim: when True, ``trim_silence`` is applied before the spectrogram
            is computed.

    Returns:
        Whatever ``npy2spec`` returns for the 16 kHz waveform.
    """
    if file_path.lower().endswith('pcm'):
        wav = pcm2npy(file_path)
        sr = 16000
    else:
        wav, sr = librosa.load(path=file_path)
        # Keyword arguments: newer librosa releases no longer accept the
        # sampling rates positionally.
        wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)  # downsampling
        sr = 16000
    if trim:
        wav = trim_silence(wav)
    return npy2spec(wav, sr)

In [None]:
 def get_thres(Model):
  """Print frame-level and utterance-level EER/threshold and return both thresholds.

  Two pickled evaluation sets are read from fixed relative paths, embedded
  with ``Model.predict`` and every trial is scored against the mean
  enrollment vector of its batch.

  Args:
    Model: object exposing ``predict``; it is called on the concatenated
      frame batches and on the windows produced by ``split_spec``.

  Returns:
    ``(frame_thres, utt_thres)``: the thresholds reported by
    ``calculate_eer`` for the two sets. Earlier versions only printed them
    and returned None.
  """
  def _score_batch(vectors, true_score, false_score):
    # Layout inside one batch: [enroll..., true..., false...].
    enroll = normalize(np.mean(vectors[:NUM_ENROLL_UTT], axis=0)) # average the enrollment utterances
    for j in range(NUM_TRUE_UTT): true_score.append(cosine_similarity(enroll,vectors[NUM_ENROLL_UTT+j]))
    for j in range(NUM_FALSE_UTT): false_score.append(cosine_similarity(enroll,vectors[NUM_ENROLL_UTT+NUM_TRUE_UTT+j]))

  def _report_eer(true_score, false_score):
    # calculate_eer takes flat score/label arrays; true trials are labelled 1.
    scores = np.array(true_score+false_score)
    labels = np.array([1.0]*len(true_score)+[0.0]*len(false_score))
    eer,thres = calculate_eer(np.arange(0, 1.0, 0.001), scores, labels, get_threshold=True)
    print('EER:', eer)
    print('threshold:', thres)
    return thres

  print('frame level eer')
  val_dataset_path = '../data/dataset/test_340_25_128_512.npy'
  # NOTE(review): pickle.load can execute arbitrary code; only load dataset
  # files that were produced by this project.
  with open(val_dataset_path,"rb") as f: batchs = pickle.load(f)

  num_batch = batchs.shape[0]
  BATCH_SIZE = NUM_ENROLL_UTT + NUM_TRUE_UTT + NUM_FALSE_UTT
  true_score = []   # scores of the true utterances
  false_score = []  # scores of the false utterances

  # Embed all batches in a single predict call, then free the input.
  batchs = np.concatenate(batchs)
  pred = Model.predict(batchs)
  pred = normalize(pred)
  del batchs

  for i in range(num_batch):
    _score_batch(pred[i*BATCH_SIZE:(i+1)*BATCH_SIZE], true_score, false_score)

  frame_thres = _report_eer(true_score, false_score)


  print('utt level eer')
  # load data
  dataset_path = '../data/dataset/testUtt_201_25.pickle'
  # NOTE(review): same pickle caveat as above.
  with open(dataset_path,"rb") as f: batchs = pickle.load(f)

  true_score = []   # scores of the true utterances
  false_score = []  # scores of the false utterances

  for batch in batchs:
    d_vectors = []
    for utt in batch:
      # One d-vector per utterance: mean of its normalized window embeddings.
      specs = split_spec(utt)
      pred = Model.predict(specs)
      pred = normalize(pred)
      d_vector = normalize(np.mean(pred, axis=0))
      d_vectors.append(d_vector)

    _score_batch(d_vectors, true_score, false_score)

  utt_thres = _report_eer(true_score, false_score)

  return frame_thres, utt_thres

 


def split_spec(spec, num_frame=None, step=None):
  """Cut a spectrogram into fixed-length windows along its first axis.

  Args:
    spec: array whose first axis is sliced into windows.
    num_frame: window length in frames; defaults to ``NUM_FRAME``.
    step: offset between consecutive window starts; defaults to half the
      window length (at least 1, so a one-frame window cannot produce a zero
      range step).

  Returns:
    ``np.array`` of the windows. Only full-length windows are kept, so the
    result is empty when ``spec`` has fewer than ``num_frame`` frames.
  """
  if num_frame is None: num_frame = NUM_FRAME
  if step is None: step = max(1, num_frame//2)
  last_start = spec.shape[0]-num_frame
  windows = [spec[start:start+num_frame,:] for start in range(0, last_start+1, step)]
  return np.array(windows)



def eval_one_to_one(model_name, checkpoint_dir, input_dir, N, M):
  """Score every utterance against every other one and print EER/threshold.

  ``input_dir`` must contain one sub-directory per speaker. A pair of
  utterances is labelled 1 when both belong to the same speaker (an
  utterance paired with itself included) and 0 otherwise.

  Args:
    model_name: name passed to ``network.get_network``.
    checkpoint_dir: directory searched by ``load_best_checkpoint``.
    input_dir: root directory with the speaker sub-directories.
    N: number of speakers.
    M: number of utterances per speaker.

  Raises:
    ValueError: if ``input_dir`` does not hold exactly N speakers with M
      utterances each. The label layout relies on that shape; previously a
      mismatch produced wrong labels or an IndexError.
  """
  Model = network.get_network(model_name)

  best_ckpt = load_best_checkpoint(checkpoint_dir)
  Model.load_weights(checkpoint_dir+'/'+best_ckpt)

  # Collect the files speaker by speaker. Sorting makes the printed matrix
  # reproducible, since os.listdir returns entries in arbitrary order.
  utt_paths = []
  speakers = sorted(os.listdir(input_dir))
  if len(speakers) != N:
    raise ValueError('expected %d speakers in %s, found %d' % (N, input_dir, len(speakers)))
  for speaker in speakers:
    print(speaker)
    utts = sorted(os.listdir(input_dir+'/'+speaker))
    if len(utts) != M:
      raise ValueError('expected %d utterances for speaker %s, found %d' % (M, speaker, len(utts)))
    for utt in utts:
      print(utt)
      utt_paths.append(input_dir+'/'+speaker+'/'+utt)

  # One d-vector per utterance: mean of its normalized window embeddings.
  d_vectors = []
  for path in utt_paths:
    specs = split_spec(convert(path))
    pred = normalize(Model.predict(specs))
    d_vectors.append(normalize(np.mean(pred, axis=0)))

  # Compute each pairwise score once and reuse it for printing and for EER.
  total = N*M
  raw = [[cosine_similarity(d_vectors[i],d_vectors[j]) for j in range(total)] for i in range(total)]
  # Printed row i holds the scores of every utterance k against utterance i,
  # as integer percentages.
  for i in range(total): print([int(raw[k][i]*100) for k in range(total)])

  sim = [score for row in raw for score in row]
  label = []
  # Row-major labels: M identical rows per speaker, with ones on that
  # speaker's own block of M columns.
  for i in range(N): label.extend(([0]*(M*i)+[1]*M+[0]*(M*(N-1-i)))*M)

  eer,thres = calculate_eer(np.arange(0, 1.0, 0.001), sim, label, get_threshold=True)
  print('EER:', eer)
  print('threshold:', thres)

In [None]:
def enroll_test_one_batch(Model, batch, input_dir):
  """Score the test utterances of one batch against its enrollment vector.

  Args:
    Model: object exposing ``predict``.
    batch: three filename lists, ``[enroll, true, false]``, relative to
      ``input_dir``.
    input_dir: directory containing the audio files.

  Returns:
    ``(sim_score, label)``: similarity of every true/false utterance to the
    averaged enrollment d-vector, and a matching array holding 1 for true
    and 0 for false utterances.
  """
  enroll_files, true_files, false_files = batch[0], batch[1], batch[2]
  num_enroll, num_true, num_false = len(enroll_files), len(true_files), len(false_files)
  filenames = enroll_files+true_files+false_files

  # Spectrogram for every file, enrollment files first.
  spectrograms = [convert(input_dir+'/'+name) for name in filenames]

  # One d-vector per file: mean of its normalized window embeddings.
  d_vectors = []
  for spectrogram in spectrograms:
    window_embeddings = normalize(Model.predict(split_spec(spectrogram)))
    d_vectors.append(normalize(np.mean(window_embeddings, axis=0)))
  d_vectors = np.array(d_vectors)

  enroll_vec = normalize(np.mean(d_vectors[:num_enroll], axis=0))
  sim_score = np.array([cosine_similarity(enroll_vec, vec) for vec in d_vectors[num_enroll:]])
  label = np.array([1]*num_true+[0]*num_false)

  return sim_score, label



def eval_enroll_test(model_name, checkpoint_dir, input_dir, batch_filenames):
  """Run enrollment/test trials for several batches and print scores and EER.

  Args:
    model_name: name passed to ``network.get_network``.
    checkpoint_dir: directory searched by ``load_best_checkpoint``.
    input_dir: directory containing the audio files.
    batch_filenames: list of batches, each ``[[enroll], [true], [false]]``,
      e.g. ``[[['A_a_1','A_a_2','A_a_3'], ['B_a_1', ...], ['B_b_1', ...]]]``.

  After the per-batch report, a second report averages the scores of batches
  0-2, 3-5 and 6+ (several enrollment vectors scored separately, then
  averaged). That part is case-specific: it is skipped unless there are at
  least 7 batches with the same number of test utterances, because the
  averages would otherwise be NaN or undefined.
  """
  def _report(scores_per_batch, labels_per_batch):
    for i in range(len(scores_per_batch)):
      print()
      print('batch:', i)
      # Labels are laid out as ones followed by zeros, so the sum is the
      # number of true utterances. Scanning for the first zero would run off
      # the end when a batch has no false utterances.
      num_true = int(np.sum(labels_per_batch[i]))
      print('true utt : ', [int(100*score) for score in scores_per_batch[i][:num_true]])
      print('false utt : ', [int(100*score) for score in scores_per_batch[i][num_true:]])

    eer,thres = calculate_eer(np.arange(0, 1.0, 0.001), np.concatenate(scores_per_batch), np.concatenate(labels_per_batch), get_threshold=True)
    print('EER:', eer)
    print('threshold:', thres)

  Model = network.get_network(model_name)

  best_ckpt = load_best_checkpoint(checkpoint_dir)
  Model.load_weights(checkpoint_dir+'/'+best_ckpt)

  sim_scores = []
  labels = []
  for batch in batch_filenames:
    sim_score, label = enroll_test_one_batch(Model, batch, input_dir)
    sim_scores.append(sim_score)
    labels.append(label)

  # Scores per batch, then EER and threshold over all batches.
  _report(sim_scores, labels)

  # Several enrollment vectors scored separately and then averaged
  # (case-specific temporary code).
  print()
  print()
  print('multiple enrollment vector case')
  same_length = len({len(scores) for scores in sim_scores}) == 1
  if len(sim_scores) < 7 or not same_length:
    print('skipped: needs at least 7 batches with equally many test utterances')
    return

  stacked = np.array(sim_scores)
  grouped = [np.mean(stacked[:3],axis=0), np.mean(stacked[3:6],axis=0), np.mean(stacked[6:],axis=0)]
  # The labels of the first three batches are reused for the three groups.
  _report(grouped, labels[:3])


In [None]:
from collections import deque

def process_audio(model_name, checkpoint_dir,input_dir,enroll_filenames,wav_filename, thres = 0.96, min_on = 0.5, min_off = 0.5):
  """Find the time spans of a recording in which the enrolled speaker is detected.

  The recording is scanned with a sliding window, 10 inferences per second.
  Each window embedding is compared with every enrollment d-vector and the
  similarities are averaged. The score is smoothed over the last 5
  inferences and compared with ``thres``. The speaker counts as present
  while at least 60% of the last 10 decisions are positive.

  Args:
    model_name: name passed to ``network.get_network``.
    checkpoint_dir: directory searched by ``load_best_checkpoint``.
    input_dir: directory containing the enrollment files and the recording.
    enroll_filenames: enrollment audio files, relative to ``input_dir``.
    wav_filename: recording to scan, relative to ``input_dir``.
    thres: minimum smoothed similarity for a positive decision.
    min_on: detections shorter than this many seconds are discarded.
    min_off: detections separated by a gap shorter than this many seconds
      are merged.

  Returns:
    List of ``[start, end]`` pairs in seconds; empty when nothing is
    detected. Each span is also played back through ``ipd.Audio``.
  """
  Model = network.get_network(model_name)
  best_ckpt = load_best_checkpoint(checkpoint_dir)
  Model.load_weights(checkpoint_dir+'/'+best_ckpt)

  # One d-vector per enrollment file: mean of its normalized window embeddings.
  def wav2vec(spec):
    specs = split_spec(spec)
    pred = normalize(Model.predict(specs))
    return normalize(np.mean(pred, axis=0))
  enrolls = np.array([wav2vec(convert(input_dir+'/'+name)) for name in enroll_filenames])

  wav,sr = librosa.load(path=input_dir+'/'+wav_filename)
  # Keyword arguments: newer librosa releases no longer accept the sampling
  # rates positionally.
  wav = librosa.resample(wav, orig_sr=sr, target_sr=16000) # downsampling
  sr = 16000

  hop = int((HOP_LENGTH/16000)*sr)
  # NOTE(review): 128 is presumably the model's frame count (NUM_FRAME) - confirm.
  one_wav_len = hop*128-1

  score_Q = deque()
  score_sum = 0
  max_score_Q_len = 5

  detect_Q = deque()
  detect_sum = 0
  max_detect_Q_len = 10

  inf_per_sec = 10
  data_per_inf = int(sr/inf_per_sec)

  is_detected = False
  log = []
  last_detected_time = 0

  window_ends = range(one_wav_len,len(wav)+1,data_per_inf)
  print('total number of inference:', len(window_ends))
  for i in window_ends:
    spec = npy2spec(wav[i-one_wav_len:i],sr)
    spec = np.expand_dims(spec,axis=0) # add the batch axis

    pred = Model.predict(spec)
    pred = normalize(pred)[0,:] # drop the batch axis again

    scores = [cosine_similarity(pred,enroll) for enroll in enrolls]
    score = sum(scores)/len(scores)

    # Running mean of the most recent scores.
    score_Q.append(score)
    score_sum+=score
    if len(score_Q)>max_score_Q_len : score_sum -= score_Q.popleft()
    mean_score = score_sum/len(score_Q)

    detect = 1 if mean_score>=thres else 0

    # Running fraction of positive decisions.
    detect_Q.append(detect)
    detect_sum+=detect
    if len(detect_Q)>max_detect_Q_len : detect_sum -= detect_Q.popleft()
    mean_detect = detect_sum/len(detect_Q)

    # State changes are back-dated by the span the decision queue covers.
    if is_detected==False and mean_detect>=0.6:
      is_detected = True
      last_detected_time = i-one_wav_len-(len(detect_Q)-1)*data_per_inf
    elif is_detected==True and mean_detect<0.6:
      is_detected = False
      end_time = i-one_wav_len-(len(detect_Q)-1)*data_per_inf
      log.append((last_detected_time/sr,end_time/sr))

  # A detection still active when the audio ends would otherwise be lost.
  if is_detected:
    log.append((last_detected_time/sr,len(wav)/sr))

  log_edit = _merge_segments(log, min_on, min_off)

  for s,e in log_edit:
    ipd.display(ipd.Audio(wav[int(s*sr):int(e*sr)], rate = sr))

  return log_edit


def _merge_segments(segments, min_on, min_off):
  """Merge spans separated by less than ``min_off`` seconds, then drop spans shorter than ``min_on``."""
  merged = []
  for start, end in segments:
    # The gap is measured from the END of the previous span.
    if merged and start-merged[-1][1]<min_off:
      merged[-1][1] = end
    else:
      merged.append([start, end])
  return [span for span in merged if span[1]-span[0]>=min_on]

In [None]:
# Scan ../data/test2/BHandJH3min.wav for the speaker enrolled with A-1..A-3, using threshold 0.96.
log_edit = process_audio('CRNN','../model/CRNN-simMat_batcher-simMat_loss-931-large', '../data/test2', ['A-1.wav','A-2.wav','A-3.wav'], 'BHandJH3min.wav', 0.96)

total number of inference: 1790


In [None]:
"""
batch_filename = [[['A-a-1.wav','A-a-2.wav','A-a-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-b-1.wav','A-b-2.wav','A-b-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-c-1.wav','A-c-2.wav','A-c-3.wav'],['B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav']]
                  ]

batch_filename = [[['A-a-1.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-a-2.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-a-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-b-1.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-b-2.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-b-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],
                  [['A-c-1.wav'],['B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav']],
                  [['A-c-2.wav'],['B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav']],
                  [['A-c-3.wav'],['B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav']]
                  ]

eval_enroll_test('CRNN','../model/CRNN-simMat_batcher-simMat_loss-931-large', '../data/test_sample', batch_filename)
"""

"\nbatch_filename = [[['A-a-1.wav','A-a-2.wav','A-a-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],\n                  [['A-b-1.wav','A-b-2.wav','A-b-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav',  'B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav']],\n                  [['A-c-1.wav','A-c-2.wav','A-c-3.wav'],['B-c-1.wav','B-c-2.wav','B-c-3.wav','C-c-1.wav','C-c-2.wav','C-c-3.wav'],['B-b-1.wav','B-b-2.wav','B-b-3.wav','C-b-1.wav','C-b-2.wav','C-b-3.wav',  'B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav']]\n                  ]\n\nbatch_filename = [[['A-a-1.wav'],['B-a-1.wav','B-a-2.wav','B-a-3.wav','C-a-1.wav','C-a-2.wav','C-a-3.wav'],['B-b-1.wav','B-b-2.wa

In [None]:
#eval_one_to_one('CRNN','../model/CRNN-simMat_batcher-simMat_loss-931-large', '../data/sample', N=6, M=5)