In [1]:
audiopth = "곰세마리.wav"
label = "곰 세마리가 한집에 있어 아빠곰 엄마곰 애기곰 아빠곰은 뚱뚱해 엄마 곰은 날"

In [2]:
import sys,os
current_dir = os.getcwd()
sys.path.append(os.path.dirname(os.path.abspath(current_dir)))
from utils.Audio import PrintAudioInfo, GetAudio, remove_Silence
from utils.levenshtein_distance import infer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Config
from transformers import Wav2Vec2Processor
import torch
import numpy as np

# model_id = 'hongseongpil/wav2vec2-vocals'
# model = Wav2Vec2ForCTC.from_pretrained(model_id,output_attentions=True)
# model.eval()
# processor = Wav2Vec2Processor.from_pretrained(model_id)

model_config = Wav2Vec2Config.from_json_file(os.path.join(current_dir,'Model','model_config.json'))
model = Wav2Vec2ForCTC(model_config)
model_dict = torch.load(os.path.join(current_dir,'Model','model_state.pt'))
model.load_state_dict(model_dict)
model.eval()
processor = Wav2Vec2Processor.from_pretrained(os.path.join(current_dir,'Model','processor_config'))


from utils.Tokenize_Kor import decompose_tokens
audio = remove_Silence(GetAudio(audiopth))[:20*1000]
input = processor(np.array(audio.set_channels(1).get_array_of_samples(),dtype=np.float32), sampling_rate=16000, return_tensors="pt").input_values[0]
with torch.no_grad():
    input_values = torch.tensor(input).unsqueeze(0)
    logits = model(input_values).logits

predlogits = torch.argmax(logits, dim=-1)[0]
outputs = processor.decode(predlogits,output_char_offsets=True)

decomponsed = decompose_tokens(label)
while ' ' in decomponsed[0]:
    index = decomponsed[0].index(' ')
    del decomponsed[0][index]
    del decomponsed[1][index]

origintext = "".join(decomponsed[0])
predtext = outputs['text'].replace(' ','')
time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  input_values = torch.tensor(input).unsqueeze(0)


In [4]:
infered = infer(origintext,predtext)
print("Matching....\n")

for item in infered:
    pred ="".join([predtext[j-1] for j in item[1]])
    print("origintext:", origintext[item[0]-1],"predtext:",pred)

Matching....

origintext: ㄱ predtext: ㄱ
origintext: ㅗ predtext: ㅓㅇㅓ
origintext: ㅁ predtext: ㄴ
origintext: ㅅ predtext: ㅅ
origintext: ㅔ predtext: ㅔㅇ
origintext: ㅁ predtext: ㅁ
origintext: ㅏ predtext: ㅏㅇㅏ
origintext: ㄹ predtext: ㄹ
origintext: ㅣ predtext: ㅣㅇㅣ
origintext: ㄱ predtext: ㄱ
origintext: ㅏ predtext: ㅏㅇㅏ
origintext: ㅎ predtext: ㅎ
origintext: ㅏ predtext: ㅏㅇㅏ
origintext: ㄴ predtext: ㅇ
origintext: ㅈ predtext: ㅈ
origintext: ㅣ predtext: ㅣ
origintext: ㅂ predtext: ㅂ
origintext: ㅔ predtext: ㅔ
origintext: ㅇ predtext: ㅇ
origintext: ㅣ predtext: ㅣ
origintext: ㅆ predtext: ㅆ
origintext: ㅓ predtext: ㅓ
origintext: ㅇ predtext: ㅇ
origintext: ㅏ predtext: ㅏㅇㅏ
origintext: ㅃ predtext: ㅃ
origintext: ㅏ predtext: ㅏ
origintext: ㄱ predtext: ㄱ
origintext: ㅗ predtext: ㅗㅇㅗ
origintext: ㅁ predtext: ㅁ
origintext: ㅇ predtext: ㅇ
origintext: ㅓ predtext: ㅓ
origintext: ㅁ predtext: ㅁ
origintext: ㅁ predtext: ㅁ
origintext: ㅏ predtext: ㅏ
origintext: ㄱ predtext: ㄱ
origintext: ㅗ predtext: ㅗㅇㅗ
origintext: ㅁ predtext: ㅁ
origint

In [5]:
from collections import Counter
count_dict = dict(Counter(decomponsed[1]))
count_list = [[num, count] for num, count in count_dict.items()]
result = []
phoneme_index = 0
for i in count_list:
    start_index = infered[phoneme_index][1][0]-1
    end_index = infered[phoneme_index+i[1]-1][1][-1]-1
    phoneme_index += i[1]
    start_offset = outputs['char_offsets'][start_index]["start_offset"]
    end_offset = outputs['char_offsets'][end_index]["end_offset"]
    result.append({'origin' : label[i[0]],'start':round (start_offset* time_offset, 2 ),'end' :round (end_offset* time_offset, 2 )})
    
result

[{'origin': '곰', 'start': 7.26, 'end': 7.54},
 {'origin': '세', 'start': 7.68, 'end': 7.78},
 {'origin': '마', 'start': 7.92, 'end': 8.08},
 {'origin': '리', 'start': 8.18, 'end': 8.52},
 {'origin': '가', 'start': 8.64, 'end': 8.94},
 {'origin': '한', 'start': 9.0, 'end': 9.36},
 {'origin': '집', 'start': 9.5, 'end': 9.58},
 {'origin': '에', 'start': 9.74, 'end': 9.82},
 {'origin': '있', 'start': 9.82, 'end': 10.14},
 {'origin': '어', 'start': 10.42, 'end': 10.5},
 {'origin': '아', 'start': 10.5, 'end': 10.94},
 {'origin': '빠', 'start': 11.08, 'end': 11.2},
 {'origin': '곰', 'start': 11.34, 'end': 11.6},
 {'origin': '엄', 'start': 11.8, 'end': 12.02},
 {'origin': '마', 'start': 12.0, 'end': 12.12},
 {'origin': '곰', 'start': 12.24, 'end': 12.5},
 {'origin': '애', 'start': 12.72, 'end': 13.04},
 {'origin': '기', 'start': 13.2, 'end': 13.5},
 {'origin': '곰', 'start': 13.62, 'end': 14.1},
 {'origin': '아', 'start': 14.5, 'end': 14.68},
 {'origin': '빠', 'start': 14.96, 'end': 15.3},
 {'origin': '곰', 'start

In [6]:
for w in result:
    start = w['start']
    end = w['end']
    print(w['origin'],w['start'],w['end'])
    PrintAudioInfo(audio[start*1000:end*1000])

곰 7.26 7.54
Channels: 1
Sample rate: 16000
Duration:  0.28
Bit depth: 2 bits
len samples: 4480


세 7.68 7.78
Channels: 1
Sample rate: 16000
Duration:  0.1
Bit depth: 2 bits
len samples: 1600


마 7.92 8.08
Channels: 1
Sample rate: 16000
Duration:  0.16
Bit depth: 2 bits
len samples: 2560


리 8.18 8.52
Channels: 1
Sample rate: 16000
Duration:  0.34
Bit depth: 2 bits
len samples: 5440


가 8.64 8.94
Channels: 1
Sample rate: 16000
Duration:  0.3
Bit depth: 2 bits
len samples: 4800


한 9.0 9.36
Channels: 1
Sample rate: 16000
Duration:  0.36
Bit depth: 2 bits
len samples: 5760


집 9.5 9.58
Channels: 1
Sample rate: 16000
Duration:  0.08
Bit depth: 2 bits
len samples: 1280


에 9.74 9.82
Channels: 1
Sample rate: 16000
Duration:  0.08
Bit depth: 2 bits
len samples: 1280


있 9.82 10.14
Channels: 1
Sample rate: 16000
Duration:  0.32
Bit depth: 2 bits
len samples: 5120


어 10.42 10.5
Channels: 1
Sample rate: 16000
Duration:  0.08
Bit depth: 2 bits
len samples: 1280


아 10.5 10.94
Channels: 1
Sample rate: 16000
Duration:  0.44
Bit depth: 2 bits
len samples: 7040


빠 11.08 11.2
Channels: 1
Sample rate: 16000
Duration:  0.12
Bit depth: 2 bits
len samples: 1920


곰 11.34 11.6
Channels: 1
Sample rate: 16000
Duration:  0.26
Bit depth: 2 bits
len samples: 4160


엄 11.8 12.02
Channels: 1
Sample rate: 16000
Duration:  0.22
Bit depth: 2 bits
len samples: 3520


마 12.0 12.12
Channels: 1
Sample rate: 16000
Duration:  0.12
Bit depth: 2 bits
len samples: 1920


곰 12.24 12.5
Channels: 1
Sample rate: 16000
Duration:  0.26
Bit depth: 2 bits
len samples: 4160


애 12.72 13.04
Channels: 1
Sample rate: 16000
Duration:  0.32
Bit depth: 2 bits
len samples: 5120


기 13.2 13.5
Channels: 1
Sample rate: 16000
Duration:  0.3
Bit depth: 2 bits
len samples: 4800


곰 13.62 14.1
Channels: 1
Sample rate: 16000
Duration:  0.48
Bit depth: 2 bits
len samples: 7680


아 14.5 14.68
Channels: 1
Sample rate: 16000
Duration:  0.18
Bit depth: 2 bits
len samples: 2880


빠 14.96 15.3
Channels: 1
Sample rate: 16000
Duration:  0.34
Bit depth: 2 bits
len samples: 5440


곰 15.42 15.8
Channels: 1
Sample rate: 16000
Duration:  0.38
Bit depth: 2 bits
len samples: 6080


은 15.88 16.16
Channels: 1
Sample rate: 16000
Duration:  0.28
Bit depth: 2 bits
len samples: 4480


뚱 16.34 16.62
Channels: 1
Sample rate: 16000
Duration:  0.28
Bit depth: 2 bits
len samples: 4480


뚱 16.82 17.14
Channels: 1
Sample rate: 16000
Duration:  0.32
Bit depth: 2 bits
len samples: 5120


해 17.2 17.3
Channels: 1
Sample rate: 16000
Duration:  0.1
Bit depth: 2 bits
len samples: 1600


엄 17.3 18.42
Channels: 1
Sample rate: 16000
Duration:  1.12
Bit depth: 2 bits
len samples: 17920


마 18.58 18.92
Channels: 1
Sample rate: 16000
Duration:  0.34
Bit depth: 2 bits
len samples: 5440


곰 19.06 19.42
Channels: 1
Sample rate: 16000
Duration:  0.36
Bit depth: 2 bits
len samples: 5760


은 19.5 19.8
Channels: 1
Sample rate: 16000
Duration:  0.3
Bit depth: 2 bits
len samples: 4800


날 19.9 19.94
Channels: 1
Sample rate: 16000
Duration:  0.04
Bit depth: 2 bits
len samples: 640
