# ETL Notebook

In [1]:
import requests
import json
import pandas as pd

# Import YT API
from secret_keys import yt_api_key
from youtube_transcript_api import YouTubeTranscriptApi



### Utility Functions

In [2]:
def getVideoInfo(response):
    """
    """
    page_items = json.loads(response.content)['items']

    video_info_list = []

    for item in page_items:
        if item['id']['kind'] == 'youtube#video':
            video_id = item['id']['videoId']
            upload_date = item['snippet']['publishedAt']
            video_title = item['snippet']['title']

            video_info_list.append({'video_id' : video_id, 'video_title' : video_title, 'upload_date' : upload_date})

    return video_info_list



### Extract Process : Extract Title & Transcript Data of RWS's YT Videos

In [3]:
# Define channel ID of RWS's Youtube Channel
channel_id = "UCXYHvmfsgeS19pjIXlyCryQ"

# Define Youtube Search API URL
yt_api_url = "https://www.googleapis.com/youtube/v3/search"

# Initialize list to store video data & page token
video_info_list = []
page_token = None

In [5]:
# Extract video data from all search result pages
while page_token != 0:
    # define parameters for Youtube API request
    params = {'key' : yt_api_key, 'channelId' : channel_id,
              'part' : ["snippet", "id"], 'order' : "date",
              'maxResults' : 50, 'pageToken' : page_token}
    
    # Make API request
    response = requests.get(yt_api_url, params=params)
    video_info_list += getVideoInfo(response)

    # Make API request for next page if next page exists
    try:
        page_token = json.loads(response.text)["nextPageToken"]
    except:
        page_token = 0

In [6]:
# Store response data in a DataFrame
df = pd.DataFrame(video_info_list)
df.head()

Unnamed: 0,video_id,video_title,upload_date
0,9sk2Vf9Eakg,AGE IS JUST A NUMBER! üí™üèª,2024-11-19T12:25:43Z
1,Tk4H8kZh0KI,Full Fight l Yonis Anane Venum Muay Thai vs Ja...,2024-11-17T04:30:12Z
2,4SnTIX075hk,Full Fight l Joe Ryan 2 Brothers Gym vs Petchm...,2024-11-17T04:15:00Z
3,mCpGEYufiEA,Full Fight l Khunsueklek Boomdeksian vs Kevin ...,2024-11-17T04:00:25Z
4,Adrbh6bz0_k,Full Fight l Domthong Lookjaoporrongtom vs Boo...,2024-11-17T03:45:00Z


In [12]:
def getVideoTranscript(video_id : str):
    """   
    """
    try:
        # get transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = ''
        for item in transcript:
            transcript_text += item['text'] + ''

    except:
        # If no captions available set transcript text to "n/a"
        transcript_text = "n/a"

    return transcript_text

In [14]:
### Add video transcripts to Video Data dataframe
df['transcript'] = df['video_id'].apply(lambda x : getVideoTranscript(x))
df.head()

Unnamed: 0,video_id,video_title,upload_date,transcript
0,9sk2Vf9Eakg,AGE IS JUST A NUMBER! üí™üèª,2024-11-19T12:25:43Z,
1,Tk4H8kZh0KI,Full Fight l Yonis Anane Venum Muay Thai vs Ja...,2024-11-17T04:30:12Z,this isR introducing your referee Mr[Music]P a...
2,4SnTIX075hk,Full Fight l Joe Ryan 2 Brothers Gym vs Petchm...,2024-11-17T04:15:00Z,this isR introducing your referee Mr Narinpong...
3,mCpGEYufiEA,Full Fight l Khunsueklek Boomdeksian vs Kevin ...,2024-11-17T04:00:25Z,this isr and introduce your referee Mr Pyon[Mu...
4,Adrbh6bz0_k,Full Fight l Domthong Lookjaoporrongtom vs Boo...,2024-11-17T03:45:00Z,this isR introducing your referee[Music]Mr and...


### Transform Process : Clean Data & Transform Into An Appropriate Format