### **Initial setup**

In [1]:
import os

import re
import json
import pandas as pd
from googleapiclient.discovery import build

from dotenv import load_dotenv
# Load variables from the .env file one directory up; YOUTUBE_API_KEY is read
# from os.environ further down (presumably it is defined in that file).
load_dotenv("../.env")

True

In [2]:
# Directory used for this notebook's data files; it is created up front
# (no error if it already exists) so later reads/writes can rely on it.
DATA_PATH = "../data/experiments"
os.makedirs(DATA_PATH, exist_ok=True)

In [3]:
# The API key comes from the environment (None when the variable is unset)
# and is handed to the YouTube Data API v3 client used by the functions below.
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
youtube = build(
    'youtube',
    'v3',
    developerKey=YOUTUBE_API_KEY,
)

### **Useful functions**

In [6]:
def getVideosFromChannel(channelID):
    """Return every item of the "uploads" playlist of a channel.

    Uses the module-level ``youtube`` API client.

    Args:
        channelID: ID of the channel whose uploads should be listed.

    Returns:
        A list with the ``items`` of every ``playlistItems().list`` page
        (requested with ``part='snippet'``), in the order the API returned them.

    Raises:
        ValueError: if the channel lookup returns no items, i.e. the ID does
            not resolve to a channel.
    """
    # Look up the ID of the playlist that holds the channel's uploads.
    channelResponse = youtube.channels().list(id=channelID, part='contentDetails').execute()
    channelItems = channelResponse.get('items', [])
    if not channelItems:
        # Without this guard an unknown ID surfaces as an opaque
        # KeyError/IndexError from the indexing below.
        raise ValueError(f"No channel found for id '{channelID}'.")
    playlistID = channelItems[0]['contentDetails']['relatedPlaylists']['uploads']

    videos = []
    nextPageToken = None

    # Walk the playlist page by page (50 items per request) until the
    # response no longer carries a token for a following page.
    while True:
        page = youtube.playlistItems().list(
            playlistId=playlistID,
            part='snippet',
            maxResults=50,
            pageToken=nextPageToken,
        ).execute()
        videos.extend(page.get('items', []))
        nextPageToken = page.get('nextPageToken')

        if not nextPageToken:
            return videos

In [7]:
def extractDescriptionFromVideo(videoInfo):
  """Return the ``description`` stored under the video's ``snippet`` entry."""
  snippet = videoInfo['snippet']
  return snippet['description']

In [87]:
def extractIDFromVideo(videoInfo):
    """Return the ``videoId`` found under ``snippet -> resourceId`` of the item."""
    resource = videoInfo["snippet"]["resourceId"]
    return resource["videoId"]

In [176]:
# A timestamp is two or more colon-separated digit groups, e.g. "1:23" or "0:01:23".
_TIMESTAMP_PATTERN = r"(?:\d+:)+\d+"
# "<timestamp> <title>"; leading non-word characters (brackets, bullets, emoji)
# are skipped so that "(0:00) Intro" keeps its title.
_TIMESTAMP_FIRST = re.compile(rf"\W*({_TIMESTAMP_PATTERN})\W+(.*)")
# "<title> <timestamp>"; the greedy title group picks the last timestamp of the line.
_TIMESTAMP_LAST = re.compile(rf"(.*)\W+({_TIMESTAMP_PATTERN})")
# Whitespace and separator characters removed from both ends of a title
# (space, tab, CR, LF, hyphen, en dash, em dash, colon, pipe).
_TITLE_PADDING = " \t\r\n-\u2013\u2014:|"


def extractChaptersFromVideoDescription(videoDescription):
    """Extract ``(timestamp, title)`` pairs from a video description.

    Every line containing a timestamp is treated as a chapter line. Both
    "<timestamp> <separator> <title>" and "<title> <separator> <timestamp>"
    layouts are supported; titles are returned without surrounding
    whitespace or separator characters.

    Args:
        videoDescription: full description text, one chapter per line.

    Returns:
        A list of ``(timestamp, title)`` tuples of strings, in the order the
        chapter lines appear in the description.

    Raises:
        ValueError: if no line contains a timestamp, or if a line contains a
            timestamp but matches neither supported layout (for example a
            line consisting of a timestamp only).
    """
    chapterLines = [
        line
        for line in videoDescription.split("\n")
        if re.search(_TIMESTAMP_PATTERN, line)
    ]

    if not chapterLines:
        raise ValueError("No chapters were detected.")

    chapters = []

    for chapterLine in chapterLines:
        # Prefer the timestamp-first layout; fall back to timestamp-last.
        match = _TIMESTAMP_FIRST.match(chapterLine)
        if match:
            timestamp, title = match.groups()
        else:
            match = _TIMESTAMP_LAST.match(chapterLine)
            if not match:
                raise ValueError(f"It was not possible to parse the following chapter: '{chapterLine}'.")
            title, timestamp = match.groups()

        # The greedy title group keeps the separator next to the timestamp
        # ("Intro - 10:30" -> "Intro -"), so trim it here.
        chapters.append((timestamp, title.strip(_TITLE_PADDING)))

    return chapters

In [177]:
def extractChaptersFromVideos(videos):
    """Build a table with one row per chapter found in the given videos.

    Args:
        videos: iterable of video items accepted by ``extractIDFromVideo``
            and ``extractDescriptionFromVideo``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``videoID``, ``description``,
        ``chapterOrder`` (0-based position of the chapter within its video),
        ``startTimestamp`` and ``chapterTitle``.

    Videos whose description cannot be parsed are reported on stdout
    (video ID followed by the error message) and contribute no rows.
    """
    columns = ["videoID", "description", "chapterOrder", "startTimestamp", "chapterTitle"]
    rows = []

    for video in videos:
        videoID = extractIDFromVideo(video)
        description = extractDescriptionFromVideo(video)
        try:
            chapters = extractChaptersFromVideoDescription(description)
        except Exception as e:
            print(videoID, str(e))
            # Skip this video: falling through would reuse the previous
            # video's chapters under this videoID, or hit a NameError if
            # the very first video fails.
            continue

        rows.extend(
            [videoID, description, order, chapter[0], chapter[1]]
            for order, chapter in enumerate(chapters)
        )

    return pd.DataFrame(data=rows, columns=columns)

### **FreeCodeCamp**

In [178]:
# Uncomment the following lines to (re-)download the channel's video list from the YouTube API and cache it as JSON in DATA_PATH
# FREE_CODE_CAMP_CHANNEL_ID = "UC8butISFwT-Wl7EV0hUK0BQ"
# videos = getVideosFromChannel(FREE_CODE_CAMP_CHANNEL_ID)
# with open(f"{DATA_PATH}/freeCodeCampVideos.json", "w") as f:
#   f.write(json.dumps(videos))

In [179]:
# Read the cached video list; the context manager closes the file handle,
# which a bare json.load(open(...)) leaves open.
with open(f"{DATA_PATH}/freeCodeCampVideos.json") as f:
    videos = json.load(f)