In [None]:
import googleapiclient.discovery
import re
! pip install youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
# YouTube Data API connection settings: service name and version passed to
# googleapiclient.discovery.build below.

api_service_name = "youtube"
api_version = "v3"
# Set your API key (intentionally blank here; fill it in before running the
# cells that issue requests).
API_KEY = ""
# API client: `youtube` is the module-level handle that the request helpers
# in this notebook call (search(), videos(), playlistItems()).
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey = API_KEY)

In [30]:
# read all lines from a file
def read_file_lines(file_path):
    """Return the lines of a text file with surrounding whitespace stripped.

    Read errors are reported on stdout rather than raised: a missing file or
    any other exception while reading results in an empty list.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one stripped string per line, or ``[]`` if the file could
        not be read.
    """
    try:
        with open(file_path, 'r') as handle:
            stripped = []
            for raw_line in handle.readlines():
                stripped.append(raw_line.strip())
            return stripped
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' does not exist.")
    except Exception as err:
        print(f"Error: An error occurred while reading the file: {err}")

    # Reached only after one of the handlers above has reported a failure.
    return []

# get videos which have a timeline in its description
def youtubeSearch (keyword, iterNum = 1):
  """Search YouTube for videos whose description seems to contain a timeline.

  Issues up to ``iterNum`` paged ``search().list`` requests (50 results per
  page) for ``keyword``. A hit is kept when its snippet description contains
  more than one ``MM:SS``-looking timestamp; for every kept hit the full
  description and parsed timeline are fetched via ``getVideoTimelineById``.

  Paging stops early when the API response carries no ``nextPageToken``,
  instead of failing on the missing key.

  Args:
    keyword: Search query string.
    iterNum: Maximum number of result pages to request.

  Returns:
    A list of dicts with the keys ``id``, ``url``, ``title``, ``timeline``
    and ``description``.
  """
  nextToken = ""
  result = []
  for _ in range(iterNum):
    request = youtube.search().list(part="id,snippet",
        type='video',
        q=keyword,
        maxResults=50,
        pageToken=nextToken,
        fields="nextPageToken,items(id(videoId),snippet(title,description))"
    )

    # Query execution
    response = request.execute()

    for item in response.get('items', []):
      des = item['snippet']['description']
      # Cheap pre-filter on the search snippet before spending another API
      # call on the full description.
      timeLine = re.findall(r"(?:[0-5]?\d):(?:[0-5]?\d)",des)
      if len(timeLine) > 1:
        videoId = item['id']['videoId']
        fullTimeline = getVideoTimelineById(videoId)
        result.append({"id":videoId , "url":"https://www.youtube.com/watch?v="+videoId, "title": item['snippet']['title'],"timeline": fullTimeline['timeline'], "description": fullTimeline['des']})

    # The last page has no nextPageToken; stop instead of raising KeyError.
    nextToken = response.get('nextPageToken')
    if not nextToken:
      break
  return result

######## getVideoTimelineById
def getVideoTimelineById (videoId):
  """Fetch a video's description and parse the timestamped lines in it.

  Calls ``videos().list`` for ``videoId`` and scans the description for
  ``[HH:]MM:SS`` timestamps followed by at least one more character on the
  same line; that trailing text (including any leading whitespace) becomes
  the entry's label.

  Args:
    videoId: YouTube video id.

  Returns:
    ``{"des": <description>, "timeline": [{"time": str, "label": str}, ...]}``.
    When the API returns no item for the id, ``des`` is ``""`` and
    ``timeline`` is empty.
  """
  request = youtube.videos().list(part="id,snippet",
                                  id = videoId
  )
  response = request.execute()

  # An id the API returns nothing for yields an empty result, not IndexError.
  items = response.get('items') or []
  if not items:
    return {"des": "", "timeline": []}

  des = items[0]['snippet']['description']
  pattern = re.compile(r"((?:(?:[01]?\d|2[0-3]):)?(?:[0-5]?\d):(?:[0-5]?\d))(.+)")

  # group(1) is the timestamp, group(2) the rest of the line after it.
  timeline = [
    {"time": match.group(1), "label": match.group(2)}
    for match in pattern.finditer(des)
  ]
  return {"des": des, "timeline": timeline}

def getTranscriptByVideoId(videoId):
    """Return the English transcript of a video, or ``None`` if unavailable.

    Uses the static ``YouTubeTranscriptApi.get_transcript`` when the installed
    youtube_transcript_api provides it. Otherwise it falls back to the
    instance API (``YouTubeTranscriptApi().fetch(...).to_raw_data()``), which
    newer releases of the library expose instead. Without the fallback a
    missing ``get_transcript`` would be swallowed by the ``except`` below and
    every video would silently yield ``None``.

    Args:
        videoId: YouTube video id.

    Returns:
        The transcript as returned by the library (callers in this notebook
        index each entry by ``'text'`` and ``'start'``), or ``None`` when it
        could not be retrieved for any reason.
    """
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            return YouTubeTranscriptApi.get_transcript(videoId, languages=['en'])
        return YouTubeTranscriptApi().fetch(videoId, languages=['en']).to_raw_data()
    except Exception:
        # Best effort by design: a video without a retrievable English
        # transcript is simply skipped by the callers.
        return None

def time_to_number(time_str):
    """Convert a ``MM:SS`` or ``HH:MM:SS`` string into a number of seconds.

    Any string that does not split into exactly two or three colon-separated
    fields yields 0.

    Args:
        time_str: Timestamp text such as ``"4:05"`` or ``"1:02:03"``.

    Returns:
        Total seconds as an int.
    """
    fields = time_str.split(":")
    if len(fields) not in (2, 3):
        return 0

    # Fold left to right: each further field is 60x finer than the previous.
    seconds = 0
    for field in fields:
        seconds = seconds * 60 + int(field)
    return seconds

#get videos from playlist
def youtubeGetPlaylist (playlistId, iterNum = 1):
  """Collect the playlist videos whose description seems to hold a timeline.

  Issues up to ``iterNum`` paged ``playlistItems().list`` requests (50 items
  per page). An item is kept when its description contains more than one
  ``MM:SS``-looking timestamp; for every kept item the description and
  parsed timeline are fetched via ``getVideoTimelineById``.

  Paging stops as soon as the response carries no ``nextPageToken``, so a
  playlist shorter than ``iterNum`` pages is not requested again with an
  empty page token (which would add the same videos a second time).

  Args:
    playlistId: YouTube playlist id.
    iterNum: Maximum number of pages to request.

  Returns:
    A list of dicts with the keys ``id``, ``url``, ``title``, ``timeline``
    and ``description``.
  """
  nextToken = ""
  result = []
  for _ in range(iterNum):
    request = youtube.playlistItems().list(
        part="snippet",
        maxResults=50,
        playlistId=playlistId,
        pageToken=nextToken,
        fields="nextPageToken,items(snippet(resourceId(videoId),title,description))"
    )

    # Query execution
    response = request.execute()

    for item in response.get('items', []):
      snippet = item['snippet']
      des = snippet['description']
      timeLine = re.findall(r"(?:[0-5]?\d):(?:[0-5]?\d)",des)
      if len(timeLine) > 1:
        videoId = snippet['resourceId']['videoId']
        fullTimeline = getVideoTimelineById(videoId)
        result.append({"id":videoId , "url":"https://www.youtube.com/watch?v="+videoId, "title": snippet['title'],"timeline": fullTimeline['timeline'], "description": fullTimeline['des']})

    # No token means this was the last page.
    nextToken = response.get('nextPageToken')
    if not nextToken:
      break
  return result

In [None]:

# Quick check: request one page (iterNum defaults to 1) of this playlist and
# display the videos whose descriptions contain a timeline.
youtubeGetPlaylist('PLTjRvDozrdlxzQet01qZBt-sRG8bbDggv')


In [33]:
# get segments by search

def getSegmentsBySearch(keywordStart):
  """Search YouTube per keyword and save each new video's segmented transcript.

  Keywords are read from ``coursesNames.txt`` (one per line), starting at
  index ``keywordStart``. For every search hit returned by ``youtubeSearch``
  that is not yet listed in ``videoIds.txt`` and has a transcript, the id is
  appended to ``videoIds.txt`` and the transcript text is written to
  ``youtube/<videoId>.txt``, one entry per line. A ``==========`` line is
  inserted before the first entry whose start time (truncated to whole
  seconds) is at or after each non-zero timeline timestamp.

  Every transcript entry is written exactly once: the entries after the last
  separator are flushed once, after all timeline items have been handled.

  Args:
    keywordStart: Index of the first keyword to process (lets an interrupted
      run be resumed).
  """
  idsPath = '/content/drive/MyDrive/0data/videoIds.txt'
  # A set makes the "already processed?" check constant-time.
  processedIds = set(read_file_lines(idsPath))
  seperator = '=========='
  keywords = read_file_lines('/content/drive/MyDrive/0data/coursesNames.txt')# path to the file which contains keywords for search
  for keyIndex in range(keywordStart, len(keywords)):
    keyword = keywords[keyIndex]
    print("keyword: "+ keyword + "index: "+str(keyIndex))
    videosWithTimelines = youtubeSearch(keyword)
    # Append mode keeps ids from earlier runs; the with-block closes (and so
    # flushes) the file after every keyword instead of leaking the handle.
    with open(idsPath, "a") as ids_file:
      for video in videosWithTimelines:
        videoId = video['id']
        if videoId in processedIds:
          continue
        transcript = getTranscriptByVideoId(videoId)
        if transcript is None:
          continue
        processedIds.add(videoId)
        ids_file.write(videoId+ "\n")
        with open("/content/drive/MyDrive/0data/youtube/"+videoId+".txt", "w") as fw:
          transcriptLen = len(transcript)
          nextEntry = 0  # index of the first transcript entry not yet written
          for timelineItem in video['timeline']:
            sectionStartTime = time_to_number(timelineItem['time'])
            if sectionStartTime == 0:
              # Zero timestamps get no separator.
              continue
            while nextEntry < transcriptLen:
              entry = transcript[nextEntry]
              nextEntry += 1
              if int(entry['start']) >= sectionStartTime:
                # First entry of the new section: separator goes before it.
                fw.write(seperator+"\n")
                fw.write(entry['text']+"\n")
                break
              fw.write(entry['text']+"\n")
          # Whatever is left belongs to the last section; write it once.
          for entry in transcript[nextEntry:]:
            fw.write(entry['text']+"\n")



In [41]:
# get segments from playlists

def getSegmentsFromPlaylists(keywordStart, itrNum = 1):
  """Save segmented transcripts for the videos of the listed playlists.

  Playlist ids are read from ``playlistIds.txt`` (one per line), starting at
  index ``keywordStart``. For every video returned by ``youtubeGetPlaylist``
  that is not yet listed in ``videoIds.txt`` and has a transcript, the id is
  appended to ``videoIds.txt`` and the transcript text is written to
  ``youtube/<videoId>.txt``, one entry per line. A ``==========`` line is
  inserted before the first entry whose start time (truncated to whole
  seconds) is at or after each non-zero timeline timestamp.

  Every transcript entry is written exactly once: the entries after the last
  separator are flushed once, after all timeline items have been handled.

  Args:
    keywordStart: Index of the first playlist id to process.
    itrNum: Passed to ``youtubeGetPlaylist`` as its page count.
  """
  idsPath = '/content/drive/MyDrive/0data/videoIds.txt'
  # A set makes the "already processed?" check constant-time.
  processedIds = set(read_file_lines(idsPath))
  seperator = '=========='
  keywords = read_file_lines('/content/drive/MyDrive/0data/playlistIds.txt')# path to the file which contains the playlist ids
  for keyIndex in range(keywordStart, len(keywords)):
    keyword = keywords[keyIndex]

    videosWithTimelines = youtubeGetPlaylist(keyword, itrNum)
    print("keyword: "+ keyword + "\tindex: "+str(keyIndex)+ "\t#videos: "+str(len(videosWithTimelines)))
    # Append mode keeps ids from earlier runs; the with-block closes (and so
    # flushes) the file after every playlist instead of leaking the handle.
    with open(idsPath, "a") as ids_file:
      for video in videosWithTimelines:
        videoId = video['id']
        print(videoId+"\t"+str(videoId in processedIds))
        if videoId in processedIds:
          continue
        transcript = getTranscriptByVideoId(videoId)
        if transcript is None:
          continue
        processedIds.add(videoId)
        ids_file.write(videoId+ "\n")
        with open("/content/drive/MyDrive/0data/youtube/"+videoId+".txt", "w") as fw:
          transcriptLen = len(transcript)
          nextEntry = 0  # index of the first transcript entry not yet written
          for timelineItem in video['timeline']:
            sectionStartTime = time_to_number(timelineItem['time'])
            if sectionStartTime == 0:
              # Zero timestamps get no separator.
              continue
            while nextEntry < transcriptLen:
              entry = transcript[nextEntry]
              nextEntry += 1
              if int(entry['start']) >= sectionStartTime:
                # First entry of the new section: separator goes before it.
                fw.write(seperator+"\n")
                fw.write(entry['text']+"\n")
                break
              fw.write(entry['text']+"\n")
          # Whatever is left belongs to the last section; write it once.
          for entry in transcript[nextEntry:]:
            fw.write(entry['text']+"\n")

In [35]:
# get segments By Video Ids

def getSegmentsByVideoIds(keywordStart):
  """Save segmented transcripts for an explicit list of video ids.

  Video ids are read from ``inputVideoIds.txt`` (one per line), starting at
  index ``keywordStart``. For every id that is not yet listed in
  ``videoIds.txt`` and has a transcript, the timeline is fetched with
  ``getVideoTimelineById``, the id is appended to ``videoIds.txt`` and the
  transcript text is written to ``youtube/<videoId>.txt``, one entry per
  line. A ``==========`` line is inserted before the first entry whose start
  time (truncated to whole seconds) is at or after each non-zero timeline
  timestamp.

  The timeline request is made only after the id passed the "already
  processed" and "has a transcript" checks, so skipped ids cost no API call.
  Every transcript entry is written exactly once.

  Args:
    keywordStart: Index of the first input id to process.
  """
  idsPath = '/content/drive/MyDrive/0data/videoIds.txt'
  # A set makes the "already processed?" check constant-time.
  processedIds = set(read_file_lines(idsPath))
  seperator = '=========='
  keywords = read_file_lines('/content/drive/MyDrive/0data/inputVideoIds.txt')
  for keyIndex in range(keywordStart, len(keywords)):
    videoId = keywords[keyIndex]

    # Each input line is a single video, hence the constant count.
    print("keyword: "+ videoId + "\tindex: "+str(keyIndex)+ "\t#videos: 1")
    print(videoId+"\t"+str(videoId in processedIds))
    if videoId in processedIds:
      continue
    transcript = getTranscriptByVideoId(videoId)
    if transcript is None:
      continue
    timeline = getVideoTimelineById(videoId)['timeline']

    processedIds.add(videoId)
    # Open, append and close immediately so the id is flushed to disk.
    with open(idsPath, "a") as ids_file:
      ids_file.write(videoId+ "\n")
    with open("/content/drive/MyDrive/0data/youtube/"+videoId+".txt", "w") as fw:
      transcriptLen = len(transcript)
      nextEntry = 0  # index of the first transcript entry not yet written
      for timelineItem in timeline:
        sectionStartTime = time_to_number(timelineItem['time'])
        if sectionStartTime == 0:
          # Zero timestamps get no separator.
          continue
        while nextEntry < transcriptLen:
          entry = transcript[nextEntry]
          nextEntry += 1
          if int(entry['start']) >= sectionStartTime:
            # First entry of the new section: separator goes before it.
            fw.write(seperator+"\n")
            fw.write(entry['text']+"\n")
            break
          fw.write(entry['text']+"\n")
      # Whatever is left belongs to the last section; write it once.
      for entry in transcript[nextEntry:]:
        fw.write(entry['text']+"\n")


In [None]:
# example 1
# used to get the transcript segments from playlists
# (start at playlist index 0, itrNum=2 page requests per playlist)
getSegmentsFromPlaylists(0,2)

In [None]:
# example 2
# used to get the transcript segments by search
# (start at keyword index 0)
getSegmentsBySearch(0)