In [47]:
import json
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
import requests
from nltk.tokenize import TextTilingTokenizer
from youtube_transcript_api import YouTubeTranscriptApi

def extract_video_id(video_url):
    """Return the YouTube video id contained in ``video_url``.

    Handles regular watch URLs (``...watch?v=<id>&t=163s``), where the id is
    the ``v`` query parameter regardless of its position or of any other
    parameters, and ``youtu.be/<id>`` short links, where the id is the path.

    Raises:
        ValueError: if no video id can be found in the URL.
    """
    parsed = urlparse(video_url)
    if parsed.hostname in ("youtu.be", "www.youtu.be"):
        candidate = parsed.path.lstrip("/")
    else:
        # parse_qs maps each parameter to a list of values; take the first "v".
        candidate = parse_qs(parsed.query).get("v", [""])[0]
    if not candidate:
        raise ValueError(f"Could not find a video id in URL: {video_url!r}")
    return candidate


url = "https://www.youtube.com/watch?v=VcVfceTsD0A&t=163s"
# Splitting on "=" would keep the trailing "&t" of the next parameter, so the
# id is read from the parsed query string instead.
video_id = extract_video_id(url)

# Download the caption entries for the video (network call).
raw = YouTubeTranscriptApi.get_transcript(video_id)

# Look up the video's title and author name through noembed.
# The video URL is passed via `params` so that requests percent-encodes it;
# when it is pasted into the query string by hand, its own "&t=163s" is read
# as a separate parameter of the noembed request instead of as part of `url`.
response = requests.get(
    "https://noembed.com/embed",
    params={"dataType": "json", "url": url},
    timeout=10,  # do not let a stalled connection hang the notebook
)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
data = response.json()

title, author = data["title"], data["author_name"]

In [48]:
# Build the caption table from the list of transcript dictionaries, deriving
# the extra columns in a single chained expression.
df = pd.DataFrame(raw).assign(
    # 'end' is each row's start plus its duration
    end=lambda frame: frame["start"] + frame["duration"],
    # 'total_words' is the running word count of the transcript up to and
    # including this row
    total_words=lambda frame: frame["text"].map(lambda line: len(line.split())).cumsum(),
    # Terminate every caption text with "\n\n"
    text=lambda frame: frame["text"] + "\n\n",
)

In [50]:
# Join every caption text (null entries skipped) into one space-separated
# string and keep it in `transcript`.
transcript = " ".join(df["text"].dropna())

In [51]:
# Create NLTK's TextTilingTokenizer with its default arguments.
tt = TextTilingTokenizer()

# Tokenize the transcript into segments using the TextTilingTokenizer
# NOTE(review): the transcript carries the "\n\n" appended to every caption
# text earlier in the notebook; presumably the tokenizer relies on these
# blank-line breaks — confirm against the NLTK documentation.
segments = tt.tokenize(transcript)

In [52]:
# Strip the "\n\n" markers out of every segment and trim the surrounding
# whitespace.
segments = [
    chunk.replace("\n\n", "").strip()
    for chunk in segments
]

In [53]:
# Cumulative word count at the end of each segment: count the words of every
# segment, then take the running total.
segments_wc = np.cumsum([len(chunk.split()) for chunk in segments])

In [54]:
def to_timestamp(seconds):
    """Format a number of seconds as a ``minutes:seconds`` string.

    The input is truncated with ``int`` first. The seconds part is always two
    digits; minutes are not folded into hours, so 3725 becomes ``"62:05"``.
    """
    whole_minutes, leftover_seconds = divmod(int(seconds), 60)
    return f"{whole_minutes}:{leftover_seconds:02d}"

In [55]:
# Map every cumulative segment word count onto the caption row whose running
# word count is nearest to it; that row is taken as the row where the
# segment ends.
idx = [
    np.argmin(np.abs(df["total_words"] - segment_total))
    for segment_total in segments_wc
]

# End time of each segment, with 0.0 placed in front so that consecutive
# entries give the (start, end) boundaries of every segment.
segment_end_times = np.insert(df["end"].iloc[idx].values, 0, 0.0)

# segment_times is a list of (start, end) timestamp-string tuples, one per
# segment, built from each pair of neighbouring boundaries.
segment_times = [
    (to_timestamp(begin), to_timestamp(finish))
    for begin, finish in zip(segment_end_times[:-1], segment_end_times[1:])
]

In [56]:
# Prefix every segment with a header holding the video title, the author and
# the segment's (start, end) timestamps.
segment_text = [
    f"'{title}' by {author}\nTimestamp: {time_range}\n\n{body}"
    for body, time_range in zip(segments, segment_times)
]