

# 1. Audio Data Acquisition

### 1.1 Download and extract audio from YouTube link

In [None]:
!pip install pytube

In [None]:
import os
from pytube import YouTube

# Specify the YouTube link of the audio source.
yt = YouTube('YOUTUBE_LINK_TO_AUDIO_SOURCE.com')

# Pick the first audio-only stream. first() returns None when the filter
# matches nothing, so fail with a clear message instead of an AttributeError.
audio_stream = yt.streams.filter(only_audio=True).first()
if audio_stream is None:
    raise RuntimeError("No audio-only stream is available for this video")
downloaded_path = audio_stream.download()

# Swap only the real file extension for '.wav'. splitext touches just the
# final suffix, so a '.mp4' elsewhere in the path is left alone and
# non-mp4 downloads (e.g. '.webm') do not end up as 'name.webm.wav'.
# NOTE(review): os.rename only changes the file name; the audio data is not
# transcoded to WAV here -- confirm downstream tools can still read it.
base_path, _ = os.path.splitext(downloaded_path)
wav_path = base_path + '.wav'
os.rename(downloaded_path, wav_path)

### 1.2 Download .srt file of the video
- online tool : https://downsub.com/
- requires the same url as the audio source

# 2. Data Preprocessing

In [None]:
!pip install pydub
!pip install ffmpeg
!pip install pandas

In [None]:
import pydub
from pydub import AudioSegment
import pandas as pd

### 2.1 Convert the .srt file into an .xlsx file for easier processing
- online tool : https://conversiontools.io/convert/srt-to-excel

In [None]:
# Load the subtitle spreadsheet produced in step 2.1
# (change the file name to match your own audio source).
subs = pd.read_excel("DRAMA-EP1.xlsx")

# Shift every row label up by one.
subs.index += 1

# Display the table (notebook cell output).
subs

### 2.2 Transcript text cleaning

In [None]:
# Build a cleaned, lower-case "transcript" column from the raw "text" column.
#
# The pattern is a regex character class: it replaces each '!', ',', '?',
# newline, '.' and '-' with a single space (the commas inside the brackets
# are literal members of the class, not separators).
# regex=True must be passed explicitly: since pandas 2.0 str.replace treats
# the pattern as a literal string by default, which would leave the text
# completely uncleaned.
subs["transcript"] = (
    subs["text"]
    .str.replace('[!,?,\n,.,-]', ' ', regex=True)
    .str.lower()
)

# The raw text is no longer needed once the cleaned column exists.
subs.drop(columns=['text'], inplace=True)
subs

### 2.3 Create new folder to store audio that will be trimmed later
- if folder exists, this segment may be ignored

In [None]:
import os

# specify the directory path for trimmed audio storage
path1 = 'path/to/DRAMA-DATA/wav_path'

try:
    # makedirs creates any missing parent folders as well, and exist_ok=True
    # makes re-running this cell a no-op when the folder is already there.
    os.makedirs(path1, exist_ok=True)
except OSError as error:
    # Best effort: report the problem (e.g. permissions) without stopping the notebook.
    print(error)

### 2.4 Trim .wav audio based on the timestamp for each speech transcript

- this simple script trims audio files based on their respective .srt timestamps.
- outputs are stored based on the transcript index number 

In [None]:
# Cut the episode audio into one .wav clip per subtitle row.
audio_file = "DRAMA-EP1.wav"
audio = AudioSegment.from_file(audio_file)
list_of_timestamps = subs['end_s']

# Start of the first clip in milliseconds
# (example: data from start_s column * 1000 ---> (start_s * 1000)).
start = 7960

# Number the clips from 1 so they line up with the DRAMA_EP1_<n>.txt
# transcripts and the names recorded in the "samples" sheet.
for idx, t in enumerate(list_of_timestamps, start=1):
    end = t * 1000  # in milliseconds
    print("split at [ {}:{}] ms".format(start, end))
    audio_chunk = audio[start:end]

    # Clips go into the wav folder (path1); os.path.join supplies the path
    # separator that plain string concatenation would drop.
    clip_path = os.path.join(path1, "DRAMA_EP1_{}.wav".format(idx))
    audio_chunk.export(clip_path, format="wav")

    # The next clip begins where this one ended.
    # NOTE(review): this means any gap between two subtitles is included at
    # the head of the following clip -- confirm that is intended.
    start = end

### 2.5 Separate each row from transcript into individual .txt file
- creates .txt files for each .wav file
- for ASR model training and fine-tuning purposes
- also creates a sheet in the previous Excel file which pairs each .wav with its respective .txt for ease of data management

In [None]:
!pip install XlsxWriter

In [None]:
# Pull the cleaned transcript column out of the subtitle table ...
text = subs.loc[:, "transcript"]

# ... and wrap it in a one-column DataFrame for the export step.
text_df = text.to_frame()

# Display the result (notebook cell output).
text_df

In [None]:
import os

# specify the directory path for .txt file storage. keep in separate folder from .wav files
path2 = 'path/to/DRAMA-DATA/transcipt_path'

try:
    # makedirs creates any missing parent folders as well, and exist_ok=True
    # makes re-running this cell a no-op when the folder is already there.
    os.makedirs(path2, exist_ok=True)
except OSError as error:
    # Best effort: report the problem (e.g. permissions) without stopping the notebook.
    print(error)

In [None]:
import csv
import xlsxwriter

# Write one .txt file per transcript row and record the wav/txt pairing in a
# "samples" sheet of the subtitle workbook.

# Workbook that receives the "samples" sheet (the file read in step 2.1).
workbook_path = 'DRAMA-EP1.xlsx'

sample_rows = []
# text_df has a single column holding the cleaned transcript; files are
# numbered from 1 to match the DRAMA_EP1_<n>.wav clips.
for n, transcript in enumerate(text_df.iloc[:, 0], start=1):
    wav_name = "DRAMA_EP1_{}.wav".format(n)
    txt_name = "DRAMA_EP1_{}.txt".format(n)

    # os.path.join puts the file inside path2; plain concatenation would
    # glue the file name onto the folder name instead.
    with open(os.path.join(path2, txt_name), 'w', encoding='utf-8') as f:
        # A missing transcript (NaN) becomes an empty file rather than a TypeError.
        f.write("" if pd.isna(transcript) else str(transcript))

    sample_rows.append({"wav_path": wav_name, "transcript_path": txt_name})

samples = pd.DataFrame(sample_rows, columns=["wav_path", "transcript_path"])

# Append the sheet to the existing workbook. xlsxwriter.Workbook() always
# creates a brand-new file, which would wipe the subtitle data stored in
# DRAMA-EP1.xlsx; pandas' append mode (openpyxl engine) keeps the existing
# sheets and replaces "samples" if this cell is run again.
# NOTE: the if_sheet_exists argument needs pandas >= 1.3.
with pd.ExcelWriter(
    workbook_path,
    mode="a",
    engine="openpyxl",
    if_sheet_exists="replace",
) as writer:
    samples.to_excel(writer, sheet_name="samples", index=False)