<a href="https://colab.research.google.com/github/Yewon9/STT_JEJU/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [1]:
# Mount Google Drive at /content/drive (Colab-only module); the dataset
# directories used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import json
import pandas as pd
import os
from tqdm import tqdm

# DATA

## wav 기준 df 생성

In [12]:
# Audio (wav) directories and label (json) directories. The two lists are paired
# by position: wav_dirs[i] is matched against json_dirs[i] when files are collected.
# NOTE(review): the first pair is TS_02 (wav) / TL_02 (json), but the second pair
# uses VL_01 for both wav and json — confirm the second wav folder name is intended.
wav_dirs = ['/content/drive/MyDrive/JEJU/data/wav/TS_02', '/content/drive/MyDrive/JEJU/data/wav/VL_01']
json_dirs = ['/content/drive/MyDrive/JEJU/data/json/TL_02', '/content/drive/MyDrive/JEJU/data/json/VL_01']

In [13]:
def extract_dialect_and_standard(json_path):
    """Read one label JSON file and return its dialect and standard-language text.

    Args:
        json_path: Path to a UTF-8 encoded JSON label file that contains
            ``script.value`` and ``transcription.segments``.

    Returns:
        A ``(dialect_text, standard_text)`` tuple. ``dialect_text`` is
        ``data['script']['value']`` unchanged. ``standard_text`` is the
        space-joined text of the segments, taking each segment's ``standard``
        value when it is non-empty and its ``dialect`` value otherwise.
        Segments that provide neither value are skipped.

    Raises:
        KeyError: If ``script.value`` or ``transcription.segments`` is missing.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Dialect text (the full script value).
    dialect_text = data['script']['value']

    # Build the standard-language text segment by segment.
    standard_parts = []
    for segment in data['transcription']['segments']:
        # Prefer the standard form; fall back to the dialect form when the
        # standard value is null or empty.
        text = segment.get('standard') or segment.get('dialect')
        # A segment with neither field would put None into the join below and
        # raise TypeError, aborting the whole batch — skip it instead.
        if text is None:
            continue
        standard_parts.append(text)

    return dialect_text, ' '.join(standard_parts)

In [14]:
# Collects one {"Path", "Text", "Standard Text"} record per matched wav/json pair.
data_list = []

In [None]:
# Start from an empty list so that re-running this cell (for example after an
# interrupted run) does not append the same files a second time.
data_list = []

for wav_dir, json_dir in zip(wav_dirs, json_dirs):
    # os.listdir() returns names in arbitrary order; sorting makes the row order
    # reproducible. Filtering first also makes the progress bar count only wav files.
    wav_files = sorted(name for name in os.listdir(wav_dir) if name.endswith(".wav"))

    for wav_file in tqdm(wav_files, desc=f"Processing {wav_dir}"):
        wav_file_name = os.path.splitext(wav_file)[0]
        json_path = os.path.join(json_dir, f"{wav_file_name}.json")
        wav_path = os.path.join(wav_dir, wav_file)

        # Only wav files that have a same-named json label file are kept.
        if not os.path.exists(json_path):
            continue

        # Extract the dialect and standard-language text, then record the pair.
        dialect_text, standard_text = extract_dialect_and_standard(json_path)
        data_list.append({"Path": wav_path, "Text": dialect_text, "Standard Text": standard_text})

Processing /content/drive/MyDrive/JEJU/data/wav/TS_02:  40%|████      | 3155/7844 [10:09<20:46,  3.76it/s]

In [None]:
# Build the table in one step from the collected records. Naming the columns
# explicitly keeps df['Path'], df['Text'] and df['Standard Text'] available even
# when no wav/json pair was found (an empty list would otherwise give a frame
# with no columns at all).
df = pd.DataFrame(data_list, columns=["Path", "Text", "Standard Text"])
df

## Text, Standard Text 비교

In [None]:
# Keep the rows whose "Text" string is exactly equal to its "Standard Text" string.
# NOTE(review): this retains the rows where both texts are identical — confirm
# that this is the intended direction of the filter.
df_filtered = df.loc[df['Text'].eq(df['Standard Text'])]
df_filtered

## 실제 음성 파일 존재 여부 확인

In [None]:
# Keep only the rows whose "Path" points to a file that exists on disk.
df_filtered = df_filtered[df_filtered['Path'].apply(os.path.exists)]
df_filtered

## 중복 텍스트 삭제

In [None]:
# Remove rows that repeat an already-seen "Text" value; the first occurrence of
# each text is the one that is kept (the default of drop_duplicates).
df_final = df_filtered.drop_duplicates(subset=['Text'])
df_final

In [None]:
# Write the final table to df.csv without the index column; the utf-8-sig codec
# writes a BOM ahead of the UTF-8 data.
df_final.to_csv('df.csv', encoding='utf-8-sig', index=False)