
**GCC-PHAT (generalized cross correlation Phase Transform) 기본 원리**

1. 투 채널을 가정했을 때, 두 개의 마이크에서 받은 신호들을 각각 푸리에 변환
2. 주파수 도메인 상에서, 두 신호의 cross power spectrum을 구함
3. phase transform을 적용해 정규화하여 phase 정보만 남김
4. inverse fourier transform해서 cross correlation을 얻음
5. cross correlation이 최댓값을 갖는 delay를 찾음
6. TDoA를 아는 상태에서 마이크 간 거리, 소리의 속도 등 마이크 어레이 정보를 통해 DoA를 계산함


In [152]:
import os
import numpy as np
import librosa

### 데이터 준비

In [153]:
folder = "data"
audios = os.listdir(folder)
# bottom, top
audios.sort()
sr = 48000

sig1, _ = librosa.load(folder + "/" + audios[0], sr=None)
sig2, _ = librosa.load(folder + "/" + audios[1], sr=None)
print(audios)

['hal_in_pure_24_4ch_48k_1.wav', 'hal_in_pure_24_4ch_48k_2.wav']


### 코드 검증

In [154]:
n = sig1.size + sig2.size
print("Time-domain1: \n", sig1, "\n", "Shape:", sig1.shape, "\n")
print("Time-domain2: \n", sig2, "\n", "Shape:", sig2.shape, "\n")
# fourier transform
SIG = np.fft.rfft(sig1, n=n)
REFSIG = np.fft.rfft(sig2, n=n)
print("Frequency-domain1: \n", SIG, "\n", "Shape:", SIG.shape, "\n")
print("Frequency-domain2: \n", REFSIG, "\n", "Shape:", REFSIG.shape, "\n")


Time-domain1: 
 [-1.0519068e-15  2.8072910e-15 -2.4481232e-15 ... -3.1524554e-02
 -3.1538107e-02 -3.1787485e-02] 
 Shape: (1144320,) 

Time-domain2: 
 [-3.2241015e-15  1.9232307e-15 -1.3310781e-15 ... -1.2696562e-02
 -1.2214071e-02 -1.2102935e-02] 
 Shape: (1144320,) 

Frequency-domain1: 
 [-4.68861425e+01+0.j         -6.55555719e+01+1.95728465j
 -4.72056293e+01+4.16960668j ... -4.65667375e-02-0.21333879j
 -1.47556429e-01+0.19701903j  2.62717529e-01+0.j        ] 
 Shape: (1144321,) 

Frequency-domain2: 
 [4.99012647e+01+0.00000000e+00j 5.64952888e+01+3.93542728e+00j
 5.37936012e+01+7.92191056e+00j ... 2.96451238e-03+1.80317502e-02j
 1.46118632e-02+8.21871548e-04j 4.97604851e-03+0.00000000e+00j] 
 Shape: (1144321,) 



In [155]:
# cross spectral density
R = SIG * np.conj(REFSIG)
print("Cross Spectral Density: \n", R, "\n", "Shape:", R.shape, "\n")

Cross Spectral Density: 
 [-2.33967780e+03+0.00000000e+00j -3.69587821e+03+3.68566548e+02j
 -2.50632955e+03+5.98256932e+02j ... -3.98491948e-03+2.07234286e-04j
 -1.99415002e-03+3.00008755e-03j  1.30729517e-03+0.00000000e+00j] 
 Shape: (1144321,) 



In [156]:
cc = np.fft.irfft(R / np.abs(R)).real
print("Cross Correlation: \n", cc, "\n", "Shape:", cc.shape, "\n")
delay_index = np.argmax(cc)
print(delay_index)

Cross Correlation: 
 [ 0.00444497 -0.08132639  0.02893813 ... -0.02335462  0.05767493
 -0.01516702] 
 Shape: (2288640,) 

2288623


### GCC-PHAT 함수 구현

In [157]:
def gcc_phat(sig1, sig2, sr):
    n = sig1.size + sig2.size
    
    SIG1 = np.fft.rfft(sig1, n=n)
    SIG2 = np.fft.rfft(sig2, n=n)
    
    R = SIG1 * np.conj(SIG2)
    cc = np.fft.irfft(R / np.abs(R)).real
    
    # cross-correlation = sample delay
    # TDoA = sample delay / sampling rate
    delay_index = np.argmax(cc)
    if delay_index > n // 2:
        sample_delay = (n-1) - delay_index
    tdoa = sample_delay / sr

    return tdoa

# tdoa = cos(doa)* distance / sound_speed
tdoa = gcc_phat(sig1, sig2, sr)
distance = 0.157
sound_speed = 343.0
doa = np.arccos(tdoa * sound_speed / distance)
print(f"TDoA: {tdoa}")
print(f"DoA: {doa}")
print(f"Azimuth: {np.degrees(doa)}")

TDoA: 0.0003333333333333333
DoA: 0.7550492498547786
Azimuth: 43.2611353411976
