In [1]:
import sys
sys.path.append("..")

import re
import json
import pickle
import pandas as pd
import webvtt
from io import StringIO
from pytube import YouTube
from datetime import datetime, timedelta
from transformers import AutoModelWithLMHead, AutoTokenizer
from utils.youtube import extractTranscriptsFromVideoID
from utils.splitter import WebVTTTextSplitter
from utils.helper import cleanText
from time import sleep
from tqdm import tqdm

from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain

from dotenv import load_dotenv
load_dotenv("../env")

  from .autonotebook import tqdm as notebook_tqdm


False

# **Preparing Dataset**

In [6]:
# Folder that holds the input JSON and every file this notebook reads or writes.
DATA_DIR = "../data/experiments"
# JSON file with the raw video records that the dataset is built from.
DATA_PATH = DATA_DIR + "/freeCodeCampVideos.json"

In [7]:
# Load the raw video records. The encoding is pinned to UTF-8 so that reading
# the file does not depend on the platform's default locale encoding.
with open(DATA_PATH, encoding = "utf-8") as f:
    videosInfo = json.load(f)

In [4]:
# Report how many raw video records were loaded.
print("Total videos:", len(videosInfo))

Total videos: 1450


In [5]:
# Uncomment the following lines to extract data from the raw videos JSON and to store it in CSV and pickle files under DATA_DIR
# noChaptersData = []
# noChaptersColumns = ["videoID", "videoTitle", "videoDescription"]

# withChaptersData = []
# withChaptersColumns = ["videoID", "videoTitle", "videoDescription", "videoChapters", "videoTranscripts", "videoTranscriptsType"]

# for video in tqdm(videosInfo):
#     videoTitle = video["snippet"]["title"]
#     videoID = video["snippet"]["resourceId"]["videoId"]
#     videoDescription = video["snippet"]["description"]
#     videoChapters = pd.DataFrame(
#         data = re.findall(r"(.*?)(\d+:\d+:\d+|\d+:\d+)(.*)", videoDescription), 
#         columns = ["beforeTimestamp", "startTimestamp", "afterTimestamp"]
#     )
#     if not videoChapters.empty:
#         videoChapters["chapterTitle"] = videoChapters.apply(lambda row: re.sub(r"[^a-zA-Z0-9\.\? ]", "", row["beforeTimestamp"] + row["afterTimestamp"]).strip(), axis = 1)
#         videoChapters = videoChapters[["startTimestamp", "chapterTitle"]].values
#         try:
#             videoTranscripts, videoTranscriptsType = extractTranscriptsFromVideoID(videoID, preprocessFunction = cleanText)
#         except:
#             videoTranscripts, videoTranscriptsType = None, None
#         withChaptersData.append([videoID, videoTitle, videoDescription, videoChapters, videoTranscripts, videoTranscriptsType])
        
#         sleep(1)
#     else:
#         noChaptersData.append([videoID, videoTitle, videoDescription])

# withChaptersDF = pd.DataFrame(data = withChaptersData, columns = withChaptersColumns)
# noChaptersDF = pd.DataFrame(data = noChaptersData, columns = noChaptersColumns)

# withChaptersDF.to_csv(f"{DATA_DIR}/withChapters.csv", index = False, lineterminator = "\n")
# with open(f"{DATA_DIR}/withChapters.pkl", "wb") as f:
#     f.write(pickle.dumps(withChaptersDF))

# noChaptersDF.to_csv(f"{DATA_DIR}/noChapters.csv", index = False, lineterminator = "\n")
# with open(f"{DATA_DIR}/noChapters.pkl", "wb") as f:
#     f.write(pickle.dumps(noChaptersDF))

In [18]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this notebook or by another trusted source.
# The files are opened with `with` so the handles are closed after loading.
with open(f"{DATA_DIR}/withChapters.pkl", "rb") as f:
    withChaptersDF = pickle.load(f)
print(f"Total videos with chapters: {withChaptersDF.shape[0]}")

# Loaded only to report the count: these videos do not contain chapters, so
# they are not used in the rest of the notebook.
with open(f"{DATA_DIR}/noChapters.pkl", "rb") as f:
    noChaptersDF = pickle.load(f)
print(f"Total videos without chapters: {noChaptersDF.shape[0]}")

Total videos with chapters: 584
Total videos without chapters: 866


In [24]:
# Keep only the videos whose transcripts are of the "manual" type, then save
# that subset both as CSV and as a pickle.
withChaptersAndManualTranscriptsDF = withChaptersDF[withChaptersDF.videoTranscriptsType == "manual"]
withChaptersAndManualTranscriptsDF.to_csv(f"{DATA_DIR}/withChaptersAndManualTranscripts.csv", index = False, lineterminator = "\n")
with open(f"{DATA_DIR}/withChaptersAndManualTranscripts.pkl", "wb") as f:
    pickle.dump(withChaptersAndManualTranscriptsDF, f)

# Read the pickle back; `with` closes the file handle once loading is done.
with open(f"{DATA_DIR}/withChaptersAndManualTranscripts.pkl", "rb") as f:
    withChaptersAndManualTranscriptsDF = pickle.load(f)
print(f"Total videos with chapters and manual transcripts: {withChaptersAndManualTranscriptsDF.shape[0]}")
withChaptersAndManualTranscriptsDF.head(3)

Total videos with chapters and manual transcripts: 225


Unnamed: 0,videoID,videoTitle,videoDescription,videoChapters,videoTranscripts,videoTranscriptsType
7,krfUjg0S2uI,"Build a Simple Website with HTML, CSS, JavaScr...","Improve your skills in JavaScript, HTML, and C...","[[0:00:00, Intro], [0:01:26, Functional requir...",WEBVTT\n\n00:00:00.000 --> 00:00:06.800\nPract...,manual
9,tbqVqP5ilzQ,"JavaScript Interview Prep: Functions, Closures...",Prepare for JavaScript interview questions foc...,"[[0:00:00, Intro], [0:01:48, Function Declarat...",WEBVTT\n\n00:00:00.000 --> 00:00:04.960\nThis ...,manual
10,uRQH2CFvedY,ChatGPT Course – Use The OpenAI API to Code 5 ...,Learn how to use the OpenAI API to create five...,"[[0:00:00, Introduction], [0:02:57, Authentica...",WEBVTT\n\n00:00:00.000 --> 00:00:06.080\nWelco...,manual


In [214]:
# Look up every video's length through pytube; the value is treated as a number
# of seconds below. NOTE(review): presumably one remote lookup per video, so
# this cell can be slow - confirm before re-running on the full set.
withChaptersAndManualTranscriptsDF["duration"] = withChaptersAndManualTranscriptsDF.videoID.apply(lambda videoID: YouTube(f"https://www.youtube.com/watch?v={videoID}").length)

# Only videos with less than 1 day of duration (i.e., less than 86400 seconds)
totalSecondsInOneDay = 86400
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (no SettingWithCopyWarning).
withChaptersAndManualTranscriptsDF = withChaptersAndManualTranscriptsDF[withChaptersAndManualTranscriptsDF["duration"] < totalSecondsInOneDay].copy()
# Convert the number of seconds into a timedelta.
withChaptersAndManualTranscriptsDF["duration"] = withChaptersAndManualTranscriptsDF["duration"].apply(lambda duration: timedelta(seconds=duration))

print(withChaptersAndManualTranscriptsDF.shape[0])
withChaptersAndManualTranscriptsDF.head(3)

222


Unnamed: 0,videoID,videoTitle,videoDescription,videoChapters,videoTranscripts,videoTranscriptsType,videoTranscriptDocuments,duration
7,krfUjg0S2uI,"Build a Simple Website with HTML, CSS, JavaScr...","Improve your skills in JavaScript, HTML, and C...","[[0:00:00, Intro], [0:01:26, Functional requir...",WEBVTT\n\n00:00:00.000 --> 00:00:06.800\nPract...,manual,"[page_content=""Practice your web development s...",0 days 07:12:29
9,tbqVqP5ilzQ,"JavaScript Interview Prep: Functions, Closures...",Prepare for JavaScript interview questions foc...,"[[0:00:00, Intro], [0:01:48, Function Declarat...",WEBVTT\n\n00:00:00.000 --> 00:00:04.960\nThis ...,manual,"[page_content=""This course will prepare you fo...",0 days 01:29:02
10,uRQH2CFvedY,ChatGPT Course – Use The OpenAI API to Code 5 ...,Learn how to use the OpenAI API to create five...,"[[0:00:00, Introduction], [0:02:57, Authentica...",WEBVTT\n\n00:00:00.000 --> 00:00:06.080\nWelco...,manual,"[page_content=""Welcome to this exciting video ...",0 days 05:17:49


In [303]:

# Sanity check: parse one stored transcript and show the start time of its
# first caption.
sampleTranscript = withChaptersAndManualTranscriptsDF["videoTranscripts"].iloc[1]
c = webvtt.read_buffer(StringIO(sampleTranscript))
c.captions[0].start

'00:00:00.000'

In [306]:
def _chapterStartToTimedelta(timestamp):
    """Convert a chapter start written as "H:MM:SS" or "MM:SS" into a timedelta.

    Raises ValueError (from strptime) when the text is not a valid clock time.
    """
    if len(timestamp.split(":")) != 3:
        # "MM:SS" -> "0:MM:SS" so that a single format string covers both forms.
        timestamp = f"0:{timestamp}"
    parsed = datetime.strptime(timestamp, "%H:%M:%S")
    return timedelta(hours = parsed.hour, minutes = parsed.minute, seconds = parsed.second)


chapterColumns = ["startTimestamp", "chapterTitle", "videoID", "endTimestamp", "chapterDuration"]

# One frame per video is collected and concatenated once at the end, instead of
# re-concatenating the growing frame on every iteration.
perVideoChapters = []

for index in withChaptersAndManualTranscriptsDF.index:
    video = withChaptersAndManualTranscriptsDF.loc[index]
    try:
        videoChaptersDF = pd.DataFrame(
            data = video.videoChapters,
            columns = ["startTimestamp", "chapterTitle"]
        ).drop_duplicates()
        videoChaptersDF["videoID"] = video.videoID
        videoChaptersDF["startTimestamp"] = videoChaptersDF.startTimestamp.apply(_chapterStartToTimedelta)
        # A chapter ends where the next one starts...
        videoChaptersDF["endTimestamp"] = videoChaptersDF.startTimestamp.shift(-1)
        # ...and the last chapter ends with the video (endTimestamp is the last
        # column at this point, hence the [-1, -1] position).
        videoChaptersDF.iloc[-1, -1] = video.duration
        videoChaptersDF["chapterDuration"] = videoChaptersDF.endTimestamp - videoChaptersDF.startTimestamp
        if (videoChaptersDF.chapterDuration <= timedelta(seconds = 0)).any():
            raise ValueError("End timestamp is lower than start timestamp")

    except Exception:
        # Videos whose chapters cannot be parsed or are inconsistent become a
        # single all-None row, which the dropna() below removes. Catching
        # Exception (not a bare except) keeps Ctrl+C / kernel interrupts working.
        videoChaptersDF = pd.DataFrame(
            data = [[None, None, video.videoID, None, None]],
            columns = chapterColumns
        )

    perVideoChapters.append(videoChaptersDF)

if perVideoChapters:
    chaptersDF = pd.concat(perVideoChapters)
else:
    # No videos at all: keep the expected columns instead of failing.
    chaptersDF = pd.DataFrame(columns = chapterColumns)

chaptersDF = chaptersDF.dropna().reset_index(drop = True)
print(f"Total unique videos: {len(chaptersDF.videoID.unique())}")
chaptersDF

Total unique videos: 193


Unnamed: 0,startTimestamp,chapterTitle,videoID,endTimestamp,chapterDuration
0,0 days 00:00:00,Intro,krfUjg0S2uI,0 days 00:01:26,0 days 00:01:26
1,0 days 00:01:26,Functional requirements of design,krfUjg0S2uI,0 days 00:11:31,0 days 00:10:05
2,0 days 00:11:31,Accessible form controls,krfUjg0S2uI,0 days 00:15:37,0 days 00:04:06
3,0 days 00:15:37,Update CSS custom properties with JS,krfUjg0S2uI,0 days 00:23:39,0 days 00:08:02
4,0 days 00:23:39,Screen readeronly text,krfUjg0S2uI,0 days 00:29:10,0 days 00:05:31
...,...,...,...,...,...
4714,0 days 00:04:23,Part 3 CSS,P2TcQ3h0ipQ,0 days 00:10:28,0 days 00:06:05
4715,0 days 00:10:28,Part 4 JavaScript Basic Setup,P2TcQ3h0ipQ,0 days 00:20:32,0 days 00:10:04
4716,0 days 00:20:32,Part 5 JavaScript Determine Winner,P2TcQ3h0ipQ,0 days 00:30:45,0 days 00:10:13
4717,0 days 00:30:45,Part 6 JavaScript Basic AI Winner Box,P2TcQ3h0ipQ,0 days 00:39:25,0 days 00:08:40


In [403]:
def _captionTimeToTimedelta(clock):
    """Convert a caption time written as "HH:MM:SS.fff" into a timedelta.

    Only hours, minutes and seconds are kept; the fractional part is dropped.
    Raises ValueError (from strptime) when the text does not match the format.
    """
    parsed = datetime.strptime(clock, "%H:%M:%S.%f")
    return timedelta(hours = parsed.hour, minutes = parsed.minute, seconds = parsed.second)


chaptersDF["chapterTranscripts"] = None

for videoID in tqdm(chaptersDF.videoID.unique()):
    try:
        rawTranscript = withChaptersAndManualTranscriptsDF[withChaptersAndManualTranscriptsDF.videoID == videoID].iloc[0].videoTranscripts
        parsedTranscript = webvtt.read_buffer(StringIO(rawTranscript))
        # One row per caption; the "webvtt" column is the caption rendered back
        # as a WebVTT cue so chapter transcripts can be re-assembled below.
        videoTranscripts = pd.DataFrame(
            data = [[
                caption.start,
                caption.end,
                caption.text,
                f"{caption.start} --> {caption.end}\n{caption.text}"
            ] for caption in parsedTranscript.captions],
            columns = ["start", "end", "text", "webvtt"]
        )
        videoTranscripts["startTimedelta"] = videoTranscripts.start.apply(_captionTimeToTimedelta)
        videoTranscripts["endTimedelta"] = videoTranscripts.end.apply(_captionTimeToTimedelta)

        chapters = chaptersDF[chaptersDF.videoID == videoID]
        for chapterIndex in chapters.index:
            chapterInfo = chapters.loc[chapterIndex]

            # A caption belongs to the chapter when it (1) spans the whole
            # chapter, (2) starts inside the chapter, or (3) ends inside it.
            chapterTranscripts = videoTranscripts[
                ((videoTranscripts.startTimedelta <= chapterInfo.startTimestamp) & (videoTranscripts.endTimedelta >= chapterInfo.endTimestamp)) |
                ((videoTranscripts.startTimedelta < chapterInfo.endTimestamp) & (videoTranscripts.startTimedelta >= chapterInfo.startTimestamp)) |
                ((videoTranscripts.endTimedelta > chapterInfo.startTimestamp) & (videoTranscripts.endTimedelta < chapterInfo.endTimestamp))
            ]

            transcriptWebVTT = "\n\n".join(chapterTranscripts.webvtt)
            transcriptWebVTT = f"WEBVTT\n\n{transcriptWebVTT}"

            chaptersDF.loc[chapterIndex, "chapterTranscripts"] = transcriptWebVTT
    except Exception:
        # Skip videos whose transcript cannot be processed; their chapters keep
        # a missing chapterTranscripts value and are dropped below. Catching
        # Exception (not a bare except) keeps Ctrl+C / kernel interrupts working.
        continue

# Dropping videos where we could not parse WEBVTT
chaptersDF = chaptersDF[~chaptersDF.videoID.isin(
    chaptersDF[chaptersDF.chapterTranscripts.isna()].videoID.unique()
)]

chaptersDF = chaptersDF.copy(deep = True)[["videoID", "startTimestamp", "endTimestamp", "chapterDuration", "chapterTitle", "chapterTranscripts"]]
print("Total videos:", len(chaptersDF.videoID.unique()))
chaptersDF.head(3)

Total videos: 192


Unnamed: 0,videoID,startTimestamp,endTimestamp,chapterDuration,chapterTitle,chapterTranscripts
0,krfUjg0S2uI,0 days 00:00:00,0 days 00:01:26,0 days 00:01:26,Intro,WEBVTT\n\n00:00:00.000 --> 00:00:06.800\nPract...
1,krfUjg0S2uI,0 days 00:01:26,0 days 00:11:31,0 days 00:10:05,Functional requirements of design,WEBVTT\n\n00:01:27.280 --> 00:01:32.160\nAll r...
2,krfUjg0S2uI,0 days 00:11:31,0 days 00:15:37,0 days 00:04:06,Accessible form controls,WEBVTT\n\n00:11:24.400 --> 00:11:33.600\nthat ...


In [491]:
# Project splitter used to break each chapter transcript into documents.
# NOTE(review): the unit of chunk_size / chunk_overlap is defined by
# WebVTTTextSplitter and is not visible here - confirm before tuning.
splitter = WebVTTTextSplitter(chunk_size = 180, chunk_overlap = 10)

chapterTranscriptDocuments = []
for index in tqdm(chaptersDF.index):
    try:
        row = chaptersDF.loc[index]
        metadata = {
            "videoID": row["videoID"],
            "chapterStartTimestamp": row["startTimestamp"],
            "chapterEndTimestamp": row["endTimestamp"],
            "chapterTitle": row["chapterTitle"]
        }
        documents = splitter.create_documents(
            [row["chapterTranscripts"]],
            metadatas = [metadata]
        )
    except Exception:
        # Record a failed chapter as an empty list rather than None: len() below
        # would raise TypeError on None, while an empty list is simply removed
        # by the "> 0" filter.
        documents = []
    chapterTranscriptDocuments.append(documents)

chaptersDF["chapterTranscriptDocument"] = chapterTranscriptDocuments
chaptersDF["totalChapterTranscriptDocuments"] = chaptersDF.chapterTranscriptDocument.apply(len)
# Keep only the chapters that produced at least one document.
chaptersDF = chaptersDF[chaptersDF["totalChapterTranscriptDocuments"] > 0]
chaptersDF.head(3)

Unnamed: 0,videoID,startTimestamp,endTimestamp,chapterDuration,chapterTitle,chapterTranscripts,chapterTranscriptDocument,totalChapterTranscriptDocuments
0,krfUjg0S2uI,0 days 00:00:00,0 days 00:01:26,0 days 00:01:26,Intro,WEBVTT\n\n00:00:00.000 --> 00:00:06.800\nPract...,"[page_content=""Practice your web development s...",1
1,krfUjg0S2uI,0 days 00:01:26,0 days 00:11:31,0 days 00:10:05,Functional requirements of design,WEBVTT\n\n00:01:27.280 --> 00:01:32.160\nAll r...,"[page_content=""All right, so when I'm starting...",4
2,krfUjg0S2uI,0 days 00:11:31,0 days 00:15:37,0 days 00:04:06,Accessible form controls,WEBVTT\n\n00:11:24.400 --> 00:11:33.600\nthat ...,"[page_content=""that out. Okay, accessibility o...",2


In [492]:
# Persist the chapter-level frame twice: as CSV and as a pickle.
chaptersDF.to_csv(
    f"{DATA_DIR}/chapters.csv",
    index = False,
    lineterminator = "\n",
)
with open(f"{DATA_DIR}/chapters.pkl", "wb") as f:
    pickle.dump(chaptersDF, f)

In [615]:
# Draw a reproducible sample of videos (fixed random_state) and keep every
# chapter that belongs to the sampled videos.
nSamples = 1
uniqueVideoIDs = chaptersDF.videoID.drop_duplicates()
sampledVideoIDs = uniqueVideoIDs.sample(nSamples, replace = False, random_state = 0).values
isSampledVideo = chaptersDF.videoID.isin(sampledVideoIDs)
sampledVideos_chaptersDF = chaptersDF[isSampledVideo].copy(deep = True)

# NOTE(review): the (commented-out) question-generation cell further down saves
# its results to these same chapters_sample.* files, so re-running this cell
# overwrites them with a frame that has no questionsAndJustifications column.
sampledVideos_chaptersDF.to_csv(
    f"{DATA_DIR}/chapters_sample.csv",
    index = False,
    lineterminator = "\n",
)
with open(f"{DATA_DIR}/chapters_sample.pkl", "wb") as f:
    pickle.dump(sampledVideos_chaptersDF, f)

## **Question Generation**

In [583]:
# Prompt sent to the LLM for every transcript document. It asks for
# "Question N: ..." / "Justification: ..." pairs separated by blank lines, which
# is the layout getQuestionsFromDocuments parses. {context} is the only
# placeholder and receives the transcript text.
QUESTION_GEN_TEMPLATE = """You are given transcripts of a video segment. Propose a diverse set of questions that these transcripts can answer, and also provide a rationale for choosing these questions. Provide the maximum number of questions as you can. Questions should be separated with double break lines and you should follow this format: 
Question [Question number]: [Your question here]
Justification: [Explanation of why this question can be answered based on the given transcripts]

Context: {context}

Questions and justifications: """

# Wrap the template so the chain can fill in the "context" variable.
questionGenPrompt = PromptTemplate(template = QUESTION_GEN_TEMPLATE, input_variables = ["context"])

In [584]:
# Chain that fills the question-generation prompt and sends it to the OpenAI
# completion model configured here.
questionGenerator = LLMChain(
    llm = OpenAI(
        temperature = 0.5,
        model_name = "text-davinci-003",
    ),
    prompt = questionGenPrompt,
)

In [667]:
def getQuestionsFromDocuments(documents):
    """Generate questions (with justifications) for each transcript document.

    The ``page_content`` of every document is sent to the module-level
    ``questionGenerator`` chain. The completion is expected to consist of
    entries separated by blank lines, each made of exactly two lines::

        Question N: <question>
        Justification: <justification>

    Entries that do not mention both "Question" and "Justification" are
    ignored.

    Args:
        documents: iterable of objects exposing a ``page_content`` string.

    Returns:
        A list with one element per document, in input order. Each element is
        a list of ``{"question": ..., "justification": ...}`` dicts, or
        ``None`` when the completion could not be parsed (the raw completion
        and the error are printed in that case).
    """
    questions = []

    for document in documents:
        result = questionGenerator.run(context = document.page_content).strip()
        try:
            parsedEntries = []
            for entry in result.split("\n\n"):
                if "Question" not in entry or "Justification" not in entry:
                    continue
                lines = entry.split("\n")
                if len(lines) != 2:
                    raise ValueError(f"expected 2 lines per entry, got {len(lines)}")
                questionLine, justificationLine = lines
                # maxsplit = 1 removes only the "Question N:" / "Justification:"
                # label; colons inside the text itself are preserved.
                parsedEntries.append({
                    "question": questionLine.split(":", 1)[1].strip(),
                    "justification": justificationLine.split(":", 1)[1].strip()
                })
            questions.append(parsedEntries)
        except Exception as e:
            print("Error while parsing result:", result, e)
            questions.append(None)
    return questions

In [736]:
# Uncomment if you need to generate questions using OpenAI LLM
# chapterQuestions = []
# for documents in tqdm(sampledVideos_chaptersDF.chapterTranscriptDocument):
#     questions = getQuestionsFromDocuments(documents)
#     chapterQuestions.append(questions)

# sampledVideos_chaptersDF["questionsAndJustifications"] = chapterQuestions
# sampledVideos_chaptersDF.to_csv(f"{DATA_DIR}/chapters_sample.csv", index = False, lineterminator = "\n")
# with open(f"{DATA_DIR}/chapters_sample.pkl", "wb") as f:
#     f.write(pickle.dumps(sampledVideos_chaptersDF))

# Load the sampled chapters that were saved together with their generated
# questions. `with` closes the file handle once loading is done.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(f"{DATA_DIR}/chapters_sample.pkl", "rb") as f:
    sampledVideos_chaptersDF = pickle.load(f)
sampledVideos_chaptersDF.head(3)

Unnamed: 0,videoID,startTimestamp,endTimestamp,chapterDuration,chapterTitle,chapterTranscripts,chapterTranscriptDocument,totalChapterTranscriptDocuments,questionsAndJustifications
2035,ZXdFisA_hOY,0 days 00:00:00,0 days 00:03:02,0 days 00:03:02,Introduction,WEBVTT\n\n00:00:00.089 --> 00:00:05.350\nYou a...,"[page_content=""You are about to learn how to u...",2,"[[{'question': 'What is a REST API?', 'justifi..."
2036,ZXdFisA_hOY,0 days 00:03:02,0 days 00:04:01,0 days 00:00:59,Creating the project,WEBVTT\n\n00:03:02.310 --> 00:03:06.049\nSo he...,"[page_content=""So here we are in business to r...",1,[[{'question': 'What kind of terminal are we u...
2037,ZXdFisA_hOY,0 days 00:04:01,0 days 00:10:03,0 days 00:06:02,Exploring the generated project files,WEBVTT\n\n00:03:59.180 --> 00:04:03.459\ncan a...,"[page_content=""can actually see all the genera...",3,[[{'question': 'What is the purpose of the ACS...


In [738]:
# questionsAndJustifications holds, per chapter, one list per transcript
# document, each of which is a list of {"question", "justification"} dicts.
# Exploding twice therefore yields one row per generated question.
finalDataset = (
    sampledVideos_chaptersDF[["videoID", "startTimestamp", "endTimestamp", "questionsAndJustifications"]]
    .copy(deep = True)
    .explode("questionsAndJustifications")
    .explode("questionsAndJustifications")
    # Documents whose completion could not be parsed (None) or that produced no
    # questions (empty list) end up as missing values here; drop them so every
    # remaining row carries a real question.
    .dropna(subset = ["questionsAndJustifications"])
    .reset_index(drop = True)
)

# Turn the dicts into "question" / "justification" columns in one pass.
questionColumns = pd.DataFrame(
    finalDataset["questionsAndJustifications"].tolist(),
    columns = ["question", "justification"],
    index = finalDataset.index
)
finalDataset = pd.concat(
    [finalDataset.drop("questionsAndJustifications", axis = 1), questionColumns],
    axis = 1
)

finalDataset.to_csv(f"{DATA_DIR}/finalDataset.csv")

print(finalDataset.shape)
finalDataset.head(5)

(855, 5)


Unnamed: 0,videoID,startTimestamp,endTimestamp,question,justification
0,ZXdFisA_hOY,0 days,0 days 00:03:02,What is a REST API?,This question can be answered based on the giv...
1,ZXdFisA_hOY,0 days,0 days 00:03:02,What is .NET 5?,This question can be answered based on the giv...
2,ZXdFisA_hOY,0 days,0 days 00:03:02,What do I need to follow the tutorial?,This question can be answered based on the giv...
3,ZXdFisA_hOY,0 days,0 days 00:03:02,What is the scenario used in this tutorial?,This question can be answered based on the giv...
4,ZXdFisA_hOY,0 days,0 days 00:03:02,What are the operations that can be performed ...,This question can be answered based on the giv...


In [27]:
# Label every question with its first space-separated word and count how often
# each label occurs.
questionTypes = finalDataset.question.map(lambda question: question.split(" ", 1)[0])
questionTypes.value_counts()

question
What     752
How       90
Why        3
Is         3
Who        3
Where      2
Under      1
Name: count, dtype: int64

In [42]:
# Print the first question found for two of the rarer leading words.
for questionType in ("Is", "Under"):
    firstMatchIndex = questionTypes[questionTypes == questionType].index[0]
    print(finalDataset.question.loc[firstMatchIndex])

Is there a UI available to display the health checks?
Under which conditions is the test method being tested?
