In [1]:
import json
import os
import re
import sys
from time import time, sleep

import openai
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

sys.path.append('../src/')
# from summarizer import get_transcript
# from summarizer import gpt3_completion

In [2]:
# Project source directory; gpt3_completion writes its logs to '<basedir>/logs'.
basedir = '../src/'
# YouTube video to process. Alternate ID kept for reference: '0lJKucu6HJc'.
video_id = 'uh1GRQtKjLo'

In [3]:
def save_file(content, filepath):
    """Write ``content`` to ``filepath`` as UTF-8 text, replacing any existing file."""
    handle = open(filepath, 'w', encoding='utf-8')
    try:
        handle.write(content)
    finally:
        # Mirror the guarantee of a ``with`` block: always release the handle.
        handle.close()

In [4]:
def gpt3_completion(prompt, model='text-davinci-003', temp=0.7, top_p=1.0, tokens=500, freq_pen=0.25, pres_pen=0.0, stop=['###']):
    """Request a completion via ``openai.Completion.create`` and log the exchange.

    The returned text is stripped and every run of whitespace is collapsed to a
    single space. Each successful call writes the prompt and the response to
    ``{basedir}/logs/gpt3_{video_id}_{timestamp}.log``; ``basedir`` and
    ``video_id`` are read from the notebook's global namespace.

    Args:
        prompt: Prompt text sent to the model.
        model: Forwarded as ``model``.
        temp: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.
        tokens: Forwarded as ``max_tokens``.
        freq_pen: Forwarded as ``frequency_penalty``.
        pres_pen: Forwarded as ``presence_penalty``.
        stop: Forwarded as ``stop``; the default list is never mutated here.

    Returns:
        The cleaned, non-empty completion text.

    Raises:
        Exception: ``'GPT3 error: <cause>'`` once ``max_retry`` attempts have
            failed, whether the API call raised, the log could not be written,
            or the model returned only whitespace.
    """
    max_retry = 1
    retry = 0
    while True:
        try:
            response = openai.Completion.create(
                model=model,
                prompt=prompt,
                temperature=temp,
                max_tokens=tokens,
                top_p=top_p,
                frequency_penalty=freq_pen,
                presence_penalty=pres_pen,
                stop=stop)
            text = response['choices'][0]['text'].strip()
            text = re.sub(r'\s+', ' ', text)
            if not text:
                # Route empty completions through the handler below so they
                # count against max_retry instead of looping forever.
                raise ValueError('empty completion returned')

            log_dir = f'{basedir}/logs'
            # A missing log directory must not discard a successful completion.
            os.makedirs(log_dir, exist_ok=True)
            filename = f'gpt3_{video_id}_{time()}.log'
            # Explicit UTF-8: prompts and completions may contain characters the
            # platform's default encoding cannot represent.
            with open(f'{log_dir}/{filename}', 'w', encoding='utf-8') as outfile:
                outfile.write('PROMPT:\n\n' + prompt + '\n\n==========\n\nRESPONSE:\n\n' + text)
            return text

        except Exception as e:
            retry += 1
            if retry >= max_retry:
                raise Exception(f'GPT3 error: {str(e)}') from e
            # No back-off between attempts (max_retry is currently 1).


In [5]:
def get_transcript(video_id):
    """Download the English transcript of a YouTube video as a JSON string.

    Args:
        video_id: Identifier of the video, passed to
            ``YouTubeTranscriptApi.get_transcript`` with ``languages=['en']``.

    Returns:
        The transcript as produced by ``JSONFormatter().format_transcript``
        (a string; this notebook parses it with ``json.loads``).

    Raises:
        Exception: ``'Video ID not found'`` when ``video_id`` is empty or None,
            or ``'Could not download the transcript: <cause>'`` when fetching
            or formatting fails. The underlying error is attached as
            ``__cause__``.
    """
    if not video_id:
        raise Exception('Video ID not found')

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        return JSONFormatter().format_transcript(transcript)
    except Exception as e:
        # Keep the root cause visible: disabled captions, a wrong ID and a
        # network failure all end up here.
        raise Exception(f'Could not download the transcript: {e}') from e

In [52]:
def _join_captions(parts):
    """Join caption fragments into one string with whitespace runs collapsed."""
    return re.sub(r'\s+', ' ', ' '.join(parts))


def _slice_window(transcript, end, start):
    """Return the text of the captions starting between ``start`` and ``end`` (inclusive)."""
    if start > end:
        raise Exception('Start is ahead of End')

    parts = []
    for obj in transcript:
        if start > obj['start']:
            continue
        if obj['start'] > end:
            # Stop at the first caption past the window
            # (assumes chronological order - TODO confirm).
            break
        parts.append(obj['text'])
    # Also reached when the transcript finishes before ``end``.
    return _join_captions(parts)


def _slice_chapters(transcript, end, start):
    """Split the captions into one ``(chapter_start, text)`` tuple per chapter."""
    if not len(start):
        raise Exception('Missing parameter: start')
    if len(start) < len(end):
        raise Exception('Missing parameter: start (one value per end is required)')
    if start[0] > end[0]:
        raise Exception('Start is ahead of End')

    chapters = []
    parts = []
    checkpoint = 0
    for obj in transcript:
        if start[0] > obj['start']:
            continue
        # Close every chapter that finished before this caption; a single
        # caption may jump over several (possibly empty) chapters.
        while checkpoint < len(end) and obj['start'] > end[checkpoint]:
            chapters.append((start[checkpoint], _join_captions(parts)))
            parts = []
            checkpoint += 1
        if checkpoint >= len(end):
            return chapters
        parts.append(obj['text'])

    # The transcript ended inside a chapter (typical when the last end is the
    # end of the video): emit the captions collected so far.
    if parts:
        chapters.append((start[checkpoint], _join_captions(parts)))
    return chapters


def slice_transcript(transcript, end=None, start=None):
    """Extract caption text from a transcript, either once or per chapter.

    Args:
        transcript: Iterable of caption dicts with at least the keys
            ``'start'`` and ``'text'``.
        end: Either a single number (example: ``652``) or a sequence of chapter
            end times (example: ``[315, 440, 630]``). Values are compared
            against each caption's ``'start'``.
        start: With a numeric ``end``: the window start, defaulting to 0.
            With a sequence: the chapter start times, at least one per end.
            Only ``start[0]`` is used to skip leading captions; the other
            values label the returned chapters.

    Returns:
        A string when ``end`` is a number. Otherwise a list of
        ``(chapter_start, text)`` tuples in chapter order; a chapter passed
        over without captions has empty text, and chapters after the last
        caption are omitted.

    Raises:
        Exception: ``'Missing parameter: end'`` / ``'Missing parameter: start'``
            when a required boundary is absent, or ``'Start is ahead of End'``
            when the first start lies after the first end.
    """
    single_window = isinstance(end, (int, float))
    if end is None or (not single_window and not len(end)):
        raise Exception('Missing parameter: end')

    if single_window:
        return _slice_window(transcript, end, 0 if start is None else start)
    return _slice_chapters(transcript, end, [] if start is None else start)


In [53]:
# Fetch the transcript of the configured video and decode its JSON payload.
transcript = json.loads(get_transcript(video_id))
# save_file(json.dumps(transcript), './transcript.txt')

# # slice once
# transcript = slice_transcript(transcript, 300, 100)

# Slice into chapters. Boundaries are in seconds and equal the start/end lists
# printed by the chapter-timestamp cell further down.
transcripts = slice_transcript(
    transcript,
    end=[315, 440, 630, 720, 910],
    start=[0, 315, 440, 630, 720],
)

# Persist the chapter slices as JSON next to the notebook.
save_file(json.dumps(transcripts), './transcript_text.txt')

In [44]:
def time_to_seconds(time_str):
    """Convert an ``MM:SS`` or ``HH:MM:SS`` string into a number of seconds.

    Raises ValueError when a field is not an integer or when the string does
    not have exactly two or three colon-separated fields.
    """
    fields = list(map(int, time_str.split(":")))
    if len(fields) not in (2, 3):
        raise ValueError("Invalid time format: " + time_str)

    # Base-60 accumulation: each field left of the current one is worth 60x more.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total

def transform_chapter_timestamp(chapters):
    '''
    Convert chapter lines into ``(start_seconds, end_seconds, title)`` tuples.

    Each line is a timestamp followed by whitespace and a title. The timestamp
    may be bare (``00:00``) or wrapped in one pair of characters, for example
    ``(00:00)``. The last line should be the 'End' timestamp, indicating the
    end of the video; it closes the final chapter and yields no tuple itself:

        00:00 Chapter Title
        04:15 Chapter Title
        07:21 Chapter Title
        10:31 End

    Blank lines are ignored and a line without a title gets an empty title.

    Returns a list with one tuple per chapter; it is empty when fewer than two
    timestamps are supplied. A malformed timestamp raises ValueError from
    ``time_to_seconds``.
    '''
    starts = []
    titles = []
    for line in chapters:
        # Split on any whitespace so tabs and repeated spaces are accepted.
        fields = line.split(None, 1)
        if not fields:
            continue    # blank line

        stamp = fields[0]
        if not stamp[0].isdigit():
            stamp = stamp[1:-1]     # drop the wrapping characters, e.g. "(05:15)"
        starts.append(time_to_seconds(stamp))
        titles.append(fields[1].rstrip() if len(fields) > 1 else '')

    # Every chapter ends where the next one starts; zip drops the unmatched
    # title of the final 'End' line.
    return list(zip(starts[:-1], starts[1:], titles))

In [45]:
# Chapter listing, one "(MM:SS) Title" entry per line. The final "End" entry
# only supplies the closing timestamp of the last chapter.
chapters = '''
(00:00) Gustafs background
(05:15) What made Airbnb so special
(07:20) How culture interviews and hiring founders contributed to Airbnbs success
(10:30) Motivations for starting companies
(12:00) Why Gustaf helps founders understand their motivations
(15:10) End
'''

In [48]:
# Trim the blank first/last lines of the literal, then split it into entries.
chapter_list = chapters.strip().split("\n")
# (start_seconds, end_seconds, title) for every chapter.
chapter_ts = transform_chapter_timestamp(chapter_list)

# Pull the start and end columns apart for slice_transcript.
ch_starts = [begin for begin, _, _ in chapter_ts]
ch_ends = [finish for _, finish, _ in chapter_ts]
print(ch_starts, ch_ends)

[0, 315, 440, 630, 720] [315, 440, 630, 720, 910]


In [61]:
# Show the working directory that the notebook's relative paths
# ('../src/', './transcript_text.txt') resolve against.
# Clear this cell's output before sharing: it exposes the local folder layout.
os.path.abspath(os.curdir)

'c:\\Users\\cn4tvne\\OneDrive - Allianz\\Personal (1Drive)\\3Hustlers\\last_week_in_podcast\\docs'