In [17]:
import sys
sys.path.append('..')
import json
import re
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter

In [18]:
def open_file(filepath):
    """Return the complete contents of the text file at ``filepath``.

    The file is opened read-only and decoded as UTF-8.
    """
    with open(filepath, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents


def save_file(content, filepath):
    """Write ``content`` to ``filepath`` as UTF-8 text.

    The file is opened in write mode, so any existing content is replaced.
    Nothing is returned.
    """
    sink = open(filepath, mode='w', encoding='utf-8')
    with sink:
        sink.write(content)

In [19]:
def get_transcript(video_id):
    """Fetch the English transcript of a video and return it as JSON text.

    The transcript is requested through ``YouTubeTranscriptApi.get_transcript``
    with ``languages=['en']`` and serialised with ``JSONFormatter``.

    Args:
        video_id: Identifier of the video; must be truthy.

    Returns:
        The string produced by ``JSONFormatter.format_transcript``.

    Raises:
        Exception: ``'Video ID not found'`` when ``video_id`` is empty/falsy, or
            ``'Could not download the transcript'`` when fetching or formatting
            fails. In the latter case the underlying error is attached as
            ``__cause__`` so the real reason is not lost.
    """
    if not video_id:
        raise Exception('Video ID not found')

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        return JSONFormatter().format_transcript(transcript)
    except Exception as e:
        # Chain explicitly so the traceback shows the original failure as the direct cause.
        raise Exception('Could not download the transcript') from e

In [20]:
def _collapse_whitespace(parts):
    """Join text fragments with spaces and squeeze every whitespace run to one space."""
    return re.sub(r'\s+', ' ', ' '.join(parts))


def _slice_single(transcript, end, start):
    """Return the text of all segments whose 'start' lies in ``[start, end]``."""
    if start > end:
        raise Exception('Start is ahead of End')

    parts = []
    for obj in transcript:
        if start > obj['start']:
            continue
        if obj['start'] > end:
            break
        parts.append(obj['text'])
    # Also reached when the transcript finishes before `end`, so the collected
    # text is returned instead of falling through to None.
    return _collapse_whitespace(parts)


def _slice_chapters(transcript, end, start):
    """Return ``[(chapter_start, text), ...]``, one entry per chapter boundary in ``end``."""
    if not len(end):
        raise Exception('Missing parameter: end')
    if not len(start):
        raise Exception('Missing parameter: start')
    if len(start) < len(end):
        raise Exception('Start must have an entry for every End')
    if start[0] > end[0]:
        raise Exception('Start is ahead of End')

    chapters = []
    parts = []
    checkpoint = 0
    for obj in transcript:
        if start[0] > obj['start']:
            continue

        # A single segment may jump over several boundaries; close every chapter
        # it has passed so the text lands in the chapter it belongs to.
        while checkpoint < len(end) and obj['start'] > end[checkpoint]:
            chapters.append((start[checkpoint], _collapse_whitespace(parts)))
            parts = []
            checkpoint += 1

        if checkpoint >= len(end):
            return chapters
        parts.append(obj['text'])

    # The transcript ended before a segment crossed the current boundary:
    # flush the pending text so the final chapter is not dropped.
    if parts:
        chapters.append((start[checkpoint], _collapse_whitespace(parts)))
    return chapters


def slice_transcript(transcript, end=(), start=()):
    """Cut transcript text by time, either as one range or as a list of chapters.

    This single definition replaces two same-named functions, of which the
    later one silently shadowed the earlier.

    ``transcript`` is iterated in order; every item is indexed with ``'start'``
    and ``'text'``. Iteration stops at the first segment past the last
    boundary, so segments are assumed to be ordered by ``'start'``.

    Single range - ``end`` is a number (seconds), e.g. ``end=652``:
        ``start`` is a number, or omitted for 0. Returns one string with the
        text of all segments whose start lies in ``[start, end]``.

    Chapters - ``end`` is a sequence, e.g. ``end=[34.56, 66.45]``:
        ``start`` is a sequence with at least one entry per ``end``. Segments
        before ``start[0]`` are skipped. A chapter is closed when a segment
        starts strictly after its ``end`` value. Returns a list of
        ``(start[i], text)`` tuples; a chapter that is passed without any
        segment yields an empty string.

    Whitespace runs in the returned text are collapsed to a single space.

    Raises:
        Exception: when ``end`` or ``start`` is missing/too short, or when the
            (first) start lies after the (first) end.
    """
    if isinstance(end, (int, float)):
        if isinstance(start, (int, float)):
            begin = start
        elif not start:
            begin = 0       # start omitted
        else:
            raise Exception('Start must be a number when End is a number')
        return _slice_single(transcript, end, begin)

    return _slice_chapters(transcript, end, start)


In [21]:
def time_to_seconds(time_str):
    """Convert a ``MM:SS`` or ``HH:MM:SS`` string into a number of seconds.

    Every colon-separated field is parsed with ``int``.

    Raises:
        ValueError: if a field is not an integer, or if the string does not
            consist of exactly two or three fields.
    """
    units = list(map(int, time_str.split(":")))
    if len(units) not in (2, 3):
        raise ValueError("Invalid time format: " + time_str)

    # Each step promotes the running total by one base-60 place.
    seconds = 0
    for unit in units:
        seconds = seconds * 60 + unit
    return seconds

def transform_chapter_timestamp(chapters):
    '''
    Turn chapter lines into (start_seconds, end_seconds, title) tuples.

    `chapters` is an iterable of strings, one chapter per line, each made of a
    timestamp followed by the title. The last line must carry the end of the
    video, because every chapter ends where the next line starts:
        "00:00 Chapter Title"
        "04:15 Chapter Title"
        "07:21 Chapter Title"
        "10:31 End"

    The timestamp and title may be separated by any amount of whitespace, and
    the title is returned without surrounding whitespace. Blank lines are
    ignored, and a line without a title gets an empty title. A timestamp that
    does not begin with a digit has its first and last character removed before
    parsing (presumably to accept stamps wrapped in brackets or parentheses).

    Returns a list with one tuple per line except the last; the last line only
    supplies the final end time. Fewer than two usable lines give an empty list.
    Timestamps are parsed by `time_to_seconds`, whose ValueError propagates.
    '''
    starts = []
    titles = []
    for line in chapters:
        # split(None, 1) splits on any whitespace run and ignores leading and
        # trailing whitespace, so "02:02  Title" does not yield " Title".
        fields = line.split(None, 1)
        if not fields:
            continue    # blank line

        stamp = fields[0]
        if not stamp[0].isdigit():
            stamp = stamp[1:-1]

        starts.append(time_to_seconds(stamp))
        titles.append(fields[1].strip() if len(fields) > 1 else '')

    # Pair each start with the following start; zip stops at the shortest input,
    # which drops the trailing 'End' title.
    return list(zip(starts[:-1], starts[1:], titles))

# Action

In [22]:
# Identifier of the video whose transcript is downloaded and split into chapters below.
video_id = 'DnEJrgc1BCk'

# Chapter listing, one "<timestamp> <title>" entry per line. The closing "End" entry
# supplies the end time of the last chapter (see transform_chapter_timestamp).
chapters = '''
0:00 Intro
02:02  What is it that you do and why does it matter? 
15:14 Why glucose?
26:45 The symptoms of bad glucose spikes 
35:06 What is glucose?
38:06 What happens to our bodies when we have a glucose spike?
43:44 Glucose as it relates to weight gain
48:30 10 Hacks to prevent glucose spikes
01:02:14 The right meal to have for breakfast
01:09:26 Why you should be drinking vinegar
01:11:54 You have to be doing this after you eat 
01:14:46 Your perfect diet 
01:24:24 Our conversation cards
01:31:18 The last guest’s question
01:35:10 End
'''

In [23]:
# One entry per line of the pasted chapter listing, then (start, end, title) per chapter.
chapter_list = chapters.strip().split("\n")
chapter_ts = transform_chapter_timestamp(chapter_list)

# Separate the start and end offsets (seconds) into parallel lists.
ch_starts = [begin for begin, _finish, _title in chapter_ts]
ch_ends = [finish for _begin, finish, _title in chapter_ts]
print('start:', ch_starts, 'end:', ch_ends)

start: [0, 122, 914, 1605, 2106, 2286, 2624, 2910, 3734, 4166, 4314, 4486, 5064, 5478] end: [122, 914, 1605, 2106, 2286, 2624, 2910, 3734, 4166, 4314, 4486, 5064, 5478, 5710]


In [24]:
max_prompt_len = 15000      # upper bound on the number of characters written per prompt file

# Download the transcript and keep a JSON copy of it next to the chapter prompts.
transcript = json.loads(get_transcript(video_id))
save_file(json.dumps(transcript), f'./chapters/transcript_{video_id}.txt')

# Split the transcript into one (chapter_start, text) pair per chapter.
transcripts = slice_transcript(transcript, ch_ends, ch_starts)

# The template is identical for every chapter, so it is read once rather than per iteration.
prompt_template = open_file('prompt_chapter_summary.txt')

for chapter in transcripts:
    chapter_start, chapter_text = chapter
    prompt = prompt_template.replace('<<TEXT>>', chapter_text)
    # Slicing leaves prompts that are already within the limit unchanged.
    prompt = prompt[:max_prompt_len]
    save_file(prompt, f'./chapters/chapter_{video_id}_{chapter_start}.txt')