<a href="https://colab.research.google.com/github/Yewon9/STT/blob/main/EDA_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [1]:
# Attach Google Drive to this Colab runtime at /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [9]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [21]:
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm
import matplotlib.pyplot as plt
import librosa
import numpy as np
import seaborn
import re

# 일반 음성 데이터

In [14]:
# Read the general-speech table from CSV into a DataFrame.
general = pd.read_csv(filepath_or_buffer='general.csv')

## 데이터 개수 확인

In [15]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37196 entries, 0 to 37195
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Path    37196 non-null  object
 1   Text    37196 non-null  object
dtypes: object(2)
memory usage: 581.3+ KB
None


In [16]:
# Leave the summary as the cell's last expression so the notebook renders it as a
# table instead of plain text.
general.describe()

                                                     Path              Text
count                                               37196             37196
unique                                              37196             37196
top     /content/drive/MyDrive/project/data_aihub/4/5e...  어, 청소 니가 대신 해 줘!
freq                                                    1                 1


In [17]:
# Select rows whose transcript is missing or blank. read_csv parses empty fields as
# NaN by default, so comparing against '' alone would never match them; NaN is mapped
# to '' first and whitespace-only transcripts are counted as blank too.
missing_text = general[general['Text'].fillna('').str.strip().eq('')]
print(f"빈 텍스트 개수: {len(missing_text)}")

빈 텍스트 개수: 0


In [18]:
# Count the rows whose transcript repeats one that appeared earlier in the frame.
duplicate_texts = general.duplicated(subset='Text').sum()
print(f"중복된 텍스트 개수: {duplicate_texts}")

중복된 텍스트 개수: 0


## 음성 파일 길이 분석

In [19]:
def get_audio_length(path):
  """Return the duration of a WAV file in seconds.

  For a filesystem path the duration is computed from the WAV header with the
  standard-library ``wave`` module, so the sample data is never loaded or
  decoded. Anything ``wave`` cannot handle (a non-path input, a file it cannot
  parse, or a header reporting no frames) is decoded with pydub instead.

  Args:
    path: Location of the WAV file (``str`` or ``os.PathLike``); other inputs
      are passed to ``AudioSegment.from_wav`` unchanged.

  Returns:
    Duration in seconds as a float. The header-based value is rounded to whole
    milliseconds so it has the same resolution as ``len(AudioSegment) / 1000``.
  """
  # Local imports: only this function needs these stdlib modules.
  import os
  import wave

  if isinstance(path, (str, os.PathLike)):
    try:
      with wave.open(os.fspath(path), 'rb') as wav_file:
        frame_count = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
      if frame_count > 0 and frame_rate > 0:
        return round(1000 * frame_count / frame_rate) / 1000
    except (wave.Error, EOFError):
      pass  # not something `wave` can parse; let pydub decode it below

  audio = AudioSegment.from_wav(path)
  return len(audio) / 1000

In [None]:
# Compute the duration of every file in the Path column, with a progress bar.
general['Audio_Length'] = list(map(get_audio_length, tqdm(general['Path'])))

 13%|█▎        | 4925/37196 [36:01<3:17:50,  2.72it/s]

### 분포 시각화로 확인 (히스토그램 또는 박스플롯)

In [None]:
# Outlier removal (placeholder: this cell contains no code yet)

## 텍스트 분석

### 텍스트 길이 분석

In [None]:
# Character count of each transcript.
general['Text_Length'] = general['Text'].map(len)

In [None]:
# Preview the first rows; as the cell's last expression the frame is rendered as a table.
general[['Text', 'Text_Length']].head()

In [None]:
# Histogram of transcript lengths; title and axis labels are set in one call on the
# current Axes.
plt.figure(figsize=(10, 6))
plt.hist(x=general['Text_Length'], bins=30, color='blue', alpha=0.7)
plt.gca().set(
    title='Distribution of Text Lengths',
    xlabel='Text Length (characters)',
    ylabel='Frequency',
)
plt.show()

### 빈 텍스트 확인

In [None]:
# Rows whose transcript has zero characters.
empty_texts = general.loc[general['Text_Length'].eq(0)]
print(f"빈 텍스트 개수: {len(empty_texts)}")

### 중복된 텍스트 확인

In [None]:
# Rows whose transcript repeats one that appeared earlier in the frame.
duplicate_texts = general.loc[general.duplicated(subset='Text')]
print(f"중복된 텍스트 개수: {len(duplicate_texts)}")

### 불필요한 기호 제거

In [None]:
# Collect every distinct character that occurs anywhere in the transcripts.
unique_chars = {char for text in general['Text'] for char in text}
print(unique_chars)

In [None]:
def clean_text(text):
    """Return ``text`` with every '+', 'u', '/', 'I' and '*' character removed.

    NOTE(review): the pattern is a character class, so each listed character is
    deleted wherever it occurs, including a plain 'u' or 'I' inside ordinary
    words. Confirm that this is intended.
    """
    unwanted = re.compile(r'[+u/I*]')
    return unwanted.sub('', text)

In [None]:
# Store the cleaned version of each transcript in a separate column.
general['Cleaned_Text'] = general['Text'].map(clean_text)

In [None]:
# Compare original and cleaned transcripts; as the cell's last expression the frame
# is rendered as a table.
general[['Text', 'Cleaned_Text']].head()

## 데이터 품질 검사

In [None]:
def calculate_snr(path, frame_ms=25, noise_percentile=10):
    """Estimate a recording's signal-to-noise ratio in decibels.

    No clean reference signal is available, so the noise floor is estimated from
    the recording itself: the audio is cut into short non-overlapping frames, the
    power of the quietest frames (the ``noise_percentile``-th percentile of frame
    power) is taken as the noise power, and it is compared with the mean frame
    power. The result is therefore a (signal + noise) / noise estimate, which is
    at least 0 dB and approaches the true SNR when the signal dominates.

    The previous formula divided the mean square of the audio by its variance,
    i.e. 1 + mean**2 / variance, which is about 0 dB for any zero-mean waveform
    regardless of how noisy it is.

    Args:
        path: Audio file to analyse; passed to ``librosa.load``.
        frame_ms: Frame duration in milliseconds.
        noise_percentile: Percentile of frame power used as the noise floor.

    Returns:
        Estimated SNR in dB as a float, or NaN when it cannot be estimated
        (empty audio, audio shorter than one frame, or all-zero audio).
    """
    # sr=None keeps the file's native sampling rate instead of resampling it.
    audio, sr = librosa.load(path, sr=None)
    if audio.size == 0:
        return float('nan')

    audio = audio - np.mean(audio)  # remove any DC offset before measuring power

    frame_length = max(1, int(sr * frame_ms / 1000))
    frame_count = audio.size // frame_length
    if frame_count == 0:
        return float('nan')  # shorter than a single frame

    frames = audio[:frame_count * frame_length].reshape(frame_count, frame_length)
    frame_power = np.mean(frames.astype(np.float64) ** 2, axis=1)

    total_power = float(np.mean(frame_power))
    if total_power <= 0.0:
        return float('nan')  # digital silence: there is nothing to measure

    # Floor the noise estimate so that frames of exact silence cannot cause a
    # division by zero (and an infinite result).
    noise_power = max(float(np.percentile(frame_power, noise_percentile)), 1e-12)
    return float(10.0 * np.log10(total_power / noise_power))

In [None]:
# Compute the SNR of every file in the Path column, with a progress bar.
general['SNR'] = list(map(calculate_snr, tqdm(general['Path'])))

In [None]:
# Histogram (with KDE) of per-file SNR, with reference lines at 10 dB and 20 dB.
# The notebook imports `seaborn` without the `sns` alias, so the module is
# referenced by its full name; the bare `sns` raised NameError.
plt.figure(figsize=(10, 6))
seaborn.histplot(general['SNR'], bins = 30, color = 'blue', kde = True)
plt.axvline(x = 10, color = 'red', linestyle='--', label = '10 dB')
plt.axvline(x = 20, color = 'green', linestyle='--', label = '20 dB')
plt.title('SNR Distribution')
plt.xlabel('SNR (dB)')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Select the files whose SNR is below the cutoff (10 or 20 dB; 20 is used here).
# The result is left as the cell's last expression so it is rendered as a table.
noisy_files = general[general['SNR'] < 20]
noisy_files[['Path', 'SNR']]