In [16]:
import os
import pickle as pkl

import pandas as pd
from konlpy.tag import Kkma
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

from wer import *
# Load the merged table; later cells read its first three columns by position
# (file name, input sentence, output sentence).
merge_csv = pd.read_csv("merge_output.csv", encoding = 'utf8')

In [17]:
# Morphological analyser used to tokenise every sentence below.
kkma = Kkma()

# Columns are addressed by position: 0 = file name, 1 = input sentence,
# 2 = output sentence. Each becomes a plain Python list.
file_list, input_text, output_text = (
    merge_csv.iloc[:, column].tolist() for column in range(3)
)

In [18]:
# Preview ten consecutive input sentences (rows 110-119) for a visual sanity check.
input_text[110:120]

['쿠폰과 다른 할인 중복 가능한가요',
 '쿠폰으로 중복 할인이 되나요',
 '초보 운전인데 주차하기 어려운 거 드릴까요',
 '토요일 점심 세 명 예약 할 수 있나요',
 '메뉴도 예약할 수 있나요',
 '런치 타임에도 예약을 받으시나요',
 '주말 예약이 가능한가요',
 '1시 예약 가능한가요',
 '저녁 7시에 가려고 하는데요 여덟 명 예약 가능한가요',
 '예약하고 가야 하나요']

In [19]:
# Same rows as the input preview, taken from the output column, for side-by-side comparison.
output_text[110:120]

['쿠폰과 다른 할인 중복가능한가요',
 '쿠폰으로 중복할인이 되나요?',
 '초보운전인데 주차하기 어려운 구조일까요?',
 '토요일 점심 3명 예약할 수 있나요?',
 '메뉴도 예약할 수 있나요?',
 '런치타임에도 예약을 받으시나요?',
 '주말 예약이 가능한가요?',
 '한 시 예약 가능한가요?',
 '저녁 7시에 가려고 하는데요, 8명 예약가능한가요?',
 '예약하고 가야하나요?']

In [20]:
import re

# Anything that is NOT Hangul (jamo or syllables), whitespace, an ASCII letter
# or a digit. Inside a character class `|` is a literal pipe, not alternation,
# so it must not be used as a separator there; the raw string keeps `\s` from
# being treated as an (invalid) string escape.
_NON_TEXT_RE = re.compile(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\sa-zA-Z0-9]+')


def make_only_text(sentence):
    """Strip punctuation and symbols from a sentence.

    Keeps Hangul jamo/syllables, whitespace, ASCII letters and digits;
    every other character (including ``|``) is removed.

    Args:
        sentence: The text to clean (``str``).

    Returns:
        The cleaned string; an empty string stays empty.
    """
    return _NON_TEXT_RE.sub('', sentence)

In [21]:
# Show one output sentence next to its morpheme split (after punctuation removal).
sample_sentence = output_text[14462]
print(sample_sentence)
print(kkma.morphs(make_only_text(sample_sentence)))

장애인 주차 구역이 따로 마련되어 있나요?
['장애인', '주차', '구역', '이', '따로', '마련', '되', '어', '있', '나요']


In [22]:
# Number of sentence pairs before any filtering.
print(len(input_text))

60867


In [23]:
inp_text = []    # morpheme lists for the kept input sentences
oup_text = []    # morpheme lists for the matching output sentences
file_name = []   # pickle path of the voice signal for each kept pair
inp_vocab = set()
oup_vocab = set()

# Walk the file names together with the sentences. Indexing file_list with a
# counter that only advances on kept rows pairs every sentence after the first
# skipped row with the wrong voice file.
for src_name, inp_sent, oup_sent in tqdm(zip(file_list, input_text, output_text),
                                          total=len(input_text)):

    # Drop very short inputs and the "no text" placeholder rows.
    if len(inp_sent) <= 5 or inp_sent =="('None text', 0)":
        continue

    # Swap the three-character extension of the source file name for 'pkl'.
    file_name.append('./voice_signal/' + src_name[:-3] + 'pkl')

    inp_sent = kkma.morphs(make_only_text(inp_sent))
    oup_sent = kkma.morphs(make_only_text(oup_sent))
    inp_text.append(inp_sent)
    oup_text.append(oup_sent)
    inp_vocab.update(inp_sent)
    oup_vocab.update(oup_sent)

# `i` used to be the loop counter and ended up as the number of kept pairs;
# keep that value available because a later cell still prints it.
i = len(file_name)

60867it [00:00, 132069.43it/s]


In [24]:
# Number of sentence pairs that survived the length / placeholder filter.
print(len(inp_text))

60438


In [25]:
# Reserved tokens always occupy ids 0, 1, 2 (padding, start, end).
special_tokens = ['<p>', '<s>', '<e>']

# sorted() gives the same token -> id assignment on every kernel run (set
# iteration order is not stable between runs). Removing the special tokens
# first makes the cell safe to re-run: they are never prepended twice, so
# '<p>' keeps id 0, which is also the value pad_sequences pads with.
inp_vocab = special_tokens + sorted(set(inp_vocab) - set(special_tokens))
oup_vocab = special_tokens + sorted(set(oup_vocab) - set(special_tokens))

In [26]:
# id -> token and token -> id lookups for the input vocabulary.
inp_num2char = dict(enumerate(inp_vocab))
inp_char2num = {token: idx for idx, token in inp_num2char.items()}

# Same pair of lookups for the output vocabulary.
oup_num2char = dict(enumerate(oup_vocab))
oup_char2num = {token: idx for idx, token in oup_num2char.items()}

In [27]:
# Sanity check: the start token is the second reserved entry, so its id should be 1.
inp_char2num['<s>']

1

In [28]:
def _encode(tokens, char2num):
    """Turn a morpheme list into ids wrapped in the <s> ... <e> markers."""
    return [char2num['<s>'], *(char2num[tok] for tok in tokens), char2num['<e>']]


inp_numsent, oup_numsent = [], []
max_len = 0  # longest encoded sentence seen on either side

for inp_sent, oup_sent in tqdm(zip(inp_text, oup_text)):
    inp_ids = _encode(inp_sent, inp_char2num)
    oup_ids = _encode(oup_sent, oup_char2num)

    inp_numsent.append(inp_ids)
    oup_numsent.append(oup_ids)

    max_len = max(max_len, len(inp_ids), len(oup_ids))

# Both sides are right-padded to the same shared length.
inp_numsent = pad_sequences(inp_numsent, maxlen=max_len, padding='post')
oup_numsent = pad_sequences(oup_numsent, maxlen=max_len, padding='post')

60438it [00:00, 171734.92it/s]


In [64]:
# One-off setup step: unpack the voice-signal archive. The recorded run could not
# find voice_signal/c.zip, and the execution count is out of sequence with the
# surrounding cells.
# NOTE(review): the next cell reads ./voice_signal/*.pkl - confirm where this archive
# is supposed to be extracted to, or drop the cell if the files are already in place.
!unzip voice_signal/c.zip

unzip:  cannot find or open voice_signal/c.zip, voice_signal/c.zip.zip or voice_signal/c.zip.ZIP.


In [29]:
def _load_pickle(path):
    """Read and return the single object stored in the pickle at `path`."""
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# One loaded object per kept sentence pair, in the same order as file_name.
pkl_file = [_load_pickle(path) for path in file_name]

In [None]:
# 80 / 10 / 10 split of the padded id sequences (same seed for both stages).
train_input, dev_input, train_output, dev_output = train_test_split(inp_numsent, oup_numsent, test_size = 0.2, random_state = 255)
val_input, test_input, val_output, test_output = train_test_split(dev_input, dev_output, test_size = 0.5, random_state = 255)

# Tokens that carry no content and are dropped when decoding.
_SKIP_TOKENS = ('<p>', '<e>', '<s>')


def _decode(sentences, num2char):
    """Map id sequences back to morpheme lists, dropping pad/start/end tokens."""
    decoded = []
    for sentence in sentences:
        tokens = (num2char[idx] for idx in sentence)
        decoded.append([tok for tok in tokens if tok not in _SKIP_TOKENS])
    return decoded


# NOTE: test_input / test_output now hold decoded morpheme lists, not id arrays.
test_input = _decode(test_input, inp_num2char)
test_output = _decode(test_output, oup_num2char)

# Accumulate the per-sentence score of the untouched input against the target.
# NOTE(review): wer() comes from the project-local `wer` module; confirm its
# expected argument order (hypothesis vs. reference).
wer_score = 0
for inp_sent, tar_sent in zip(test_input, test_output):
    wer_score += wer(inp_sent, tar_sent)

# Report the number of test sentences and their mean score. The previous version
# printed `i`, a counter left over from the preprocessing loop.
print(len(test_input), wer_score/len(test_input))

In [58]:
# Show the third test pair: input morphemes first, then the target morphemes.
for decoded_split in (test_input, test_output):
    print(decoded_split[2])

['주소', '는', '마포구', '와우', '산로', '22', '길', '20', '28', '이', 'ㅂ니다']
['주소', '는', '마포구', '와우', '산로', '22', '길', '2028', '이', 'ㅂ니다']


In [30]:
# Split text input, text output and voice features in ONE call per stage so that
# every array is cut with the same shuffled indices. train_test_split checks
# that all arrays have the same number of samples, so a text/voice length
# mismatch now raises instead of silently producing misaligned splits.
# With the same seed and sample count the resulting splits are unchanged.

# Stage 1: 80 % train, 20 % held out.
(train_input, dev_input,
 train_output, dev_output,
 train_voice, dev_voice) = train_test_split(
    inp_numsent, oup_numsent, pkl_file, test_size = 0.2, random_state = 255)

# Stage 2: halve the held-out part into validation and test.
(val_input, test_input,
 val_output, test_output,
 val_voice, test_voice) = train_test_split(
    dev_input, dev_output, dev_voice, test_size = 0.5, random_state = 255)

In [66]:
# Directory that receives every preprocessed artefact; create it on a fresh
# checkout so the dumps below do not fail with FileNotFoundError.
output_dir = './preprocessed_data'
os.makedirs(output_dir, exist_ok=True)

# File stem -> object to pickle. File names are unchanged from before.
artifacts = {
    # text splits
    'train_text_input': train_input,
    'train_text_output': train_output,
    'val_text_input': val_input,
    'val_text_output': val_output,
    'test_text_input': test_input,
    'test_text_output': test_output,
    # voice splits
    'train_voice_input': train_voice,
    'val_voice_input': val_voice,
    'test_voice_input': test_voice,
    # vocabulary lookups
    'inp_num2char': inp_num2char,
    'inp_char2num': inp_char2num,
    'oup_num2char': oup_num2char,
    'oup_char2num': oup_char2num,
}

for stem, obj in artifacts.items():
    with open(os.path.join(output_dir, stem + '.pkl'), 'wb') as f:
        pkl.dump(obj, f)
