# ETL Notebook

In [3]:
import html
import json
import os
from datetime import datetime

import pandas as pd
import requests
from youtube_transcript_api import YouTubeTranscriptApi



### Utility Functions

In [4]:
def getVideoRecords(response: "requests.models.Response") -> list:
    """
        Function to extract YouTube video data from GET request response

        Parameters
        ----------
        response : requests.models.Response
            Response of a GET request whose JSON body holds the search
            results under the 'items' key.

        Returns
        -------
        list
            One dict per result whose id kind is "youtube#video", with the keys
            'video_id' (id.videoId), 'datetime' (snippet.publishedAt) and
            'title' (snippet.title). Empty if the body has no 'items'.

        Raises
        ------
        requests.HTTPError
            If the response carries an HTTP error status.
    """

    # Surface a failed request as a clear HTTP error instead of an opaque
    # KeyError on the missing 'items' key further down
    response.raise_for_status()

    raw_items = response.json().get('items', [])

    # Keep only results whose id kind is "youtube#video"; 'videoId' is read
    # only for those.
    # NOTE(review): the previous comment said this check drops Shorts, but it
    # only inspects the result kind -- confirm against the API docs.
    return [
        {
            'video_id': raw_item['id']['videoId'],
            'datetime': raw_item['snippet']['publishedAt'],
            'title': raw_item['snippet']['title'],
        }
        for raw_item in raw_items
        if raw_item['id']['kind'] == "youtube#video"
    ]

In [5]:
def extractTranscript(transcript: list) -> str:
    """
        Join the 'text' field of every transcript segment into one string.

        Segments are concatenated in their original order, separated by a
        single space; an empty transcript yields an empty string.
    """

    return ' '.join(segment['text'] for segment in transcript)

### Extract Sylvie von Duuglas-Ittu's YouTube Video Record Data

In [6]:
# Accumulators for the paginated search: no page token yet, no records yet
video_record_list = []
page_token = None

# Fixed inputs for the YouTube API requests
yt_api_key = os.environ.get('YT_API_KEY')    # personal API access key, read from the environment
yt_search_url = 'https://www.googleapis.com/youtube/v3/search'    # YouTube search API url
channel_id = "UCgFe05f-DrPpaunE4Gaz3cQ"   # Sylvie von Duuglas-Ittu's Channel ID

In [7]:
# Extract video data for API search results, one page per iteration.
# page_token starts as None (first page) and is set to the sentinel 0 once the
# last page has been read, so re-running this cell on its own does not fetch
# and append the same records a second time.
while page_token != 0:
    # define API request parameters (requests omits keys whose value is None,
    # so no pageToken is sent for the first page)
    params = {'key' : yt_api_key, 'channelId' : channel_id,
              'part' : ["snippet", "id"], 'order' : "date",
              'maxResults' : 50, 'pageToken' : page_token}

    # Make API request; the timeout keeps a stalled connection from hanging the notebook
    response = requests.get(yt_search_url, params=params, timeout=30)
    # Fail loudly on an HTTP error status rather than on a later KeyError
    response.raise_for_status()
    video_record_list += getVideoRecords(response)

    # A body without "nextPageToken" marks the last page: fall back to the
    # sentinel 0, which ends the while loop
    page_token = response.json().get("nextPageToken", 0)

In [8]:
# Store video record data in a Pandas DataFrame.
# Columns are named explicitly so the frame keeps its schema even when no
# records were fetched; later cells access these columns by name.
df = pd.DataFrame(video_record_list, columns=['video_id', 'datetime', 'title'])
df.head()

Unnamed: 0,video_id,datetime,title
0,S2rqRaL1P8w,2024-12-03T02:48:10Z,What Was It Like Boxing After So Many Muay Tha...
1,jquPsuLggI0,2024-11-30T07:52:58Z,Post Fight Update - 1st &quot;Official&quot; B...
2,J2b5ziNb2Mg,2024-11-29T03:17:38Z,Fight 177 - Sylvie vs Nantida Sitweerachat wit...
3,5kFbuiLXvsU,2024-11-20T09:56:40Z,"Vlog - dogs, treats, and my favorite old lady."
4,lswgCMqcOZQ,2024-11-12T03:54:44Z,"Boxing, Sparring, Breast Tissue Health, Gettin..."


### Extract Video Transcript Text

In [9]:
# Initialize a list to store video captions
transcript_text_list = []

# Fetch the transcript of every video, in row order
for video_id in df['video_id']:
    try:
        # get transcript and flatten it to a single string
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = extractTranscript(transcript)
    except Exception:
        # Best-effort: any failure to get a transcript is recorded as "n/a".
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # stop this long-running loop.
        transcript_text = "n/a"
    # Append transcript text to list
    transcript_text_list.append(transcript_text)

In [10]:
# Add video transcripts to dataframe.
# The list is assigned positionally: wrapping it in pd.Series would align on
# index labels and silently fill NaN on a mismatch, whereas a plain list
# raises if its length differs from the frame's.
df['transcript'] = transcript_text_list
df.head()

Unnamed: 0,video_id,datetime,title,transcript
0,S2rqRaL1P8w,2024-12-03T02:48:10Z,What Was It Like Boxing After So Many Muay Tha...,howdy welcome to my run cast I'm so excited be...
1,jquPsuLggI0,2024-11-30T07:52:58Z,Post Fight Update - 1st &quot;Official&quot; B...,hey so I am outside of the Tai payak gym here ...
2,J2b5ziNb2Mg,2024-11-29T03:17:38Z,Fight 177 - Sylvie vs Nantida Sitweerachat wit...,this is a fight up in nong bua cope which is k...
3,5kFbuiLXvsU,2024-11-20T09:56:40Z,"Vlog - dogs, treats, and my favorite old lady.",so I just had to explain to my little dog frie...
4,lswgCMqcOZQ,2024-11-12T03:54:44Z,"Boxing, Sparring, Breast Tissue Health, Gettin...",howdy howdy welcome to my runcast it's been a ...


### Transform Process : Clean data & transform it into a usable format for the ML Model

In [11]:
# Report the frame's shape, the number of non-repeated rows, and how many
# distinct values each column holds
duplicate_row_count = sum(df.duplicated())
unique_counts = df.nunique()

print("shape : ", df.shape)
print("num unqiue rows : ", df.shape[0] - duplicate_row_count)
for column, unique_count in unique_counts.items():
    print(f"num unique entries in {column} column : ", unique_count)

shape :  (500, 4)
num unqiue rows :  500
num unique entries in video_id column :  500
num unique entries in datetime column :  500
num unique entries in title column :  500
num unique entries in transcript column :  414


In [20]:
# Reformat datetime column: parse the 'YYYY-MM-DDTHH:MM:SSZ' strings into datetimes.
# pd.to_datetime is vectorized and leaves an already-converted column unchanged,
# so this cell can be re-run safely (strptime raised TypeError on a second run).
# The trailing 'Z' is matched as a literal, so the values stay timezone-naive.
df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%dT%H:%M:%SZ')

In [34]:
# Clean the text of the title and transcript columns.
# html.unescape decodes every HTML entity (&quot;, &#39;, &amp;, ...) rather
# than only the two that were previously replaced by hand.
# string_replacements lists the literal substrings still to be replaced afterwards.
string_replacements = {"[Music]" : ""}

for column in ('title', 'transcript'):
    cleaned = df[column].map(html.unescape)
    for old, new in string_replacements.items():
        # regex=False: "[Music]" must be matched literally, not as a character class
        cleaned = cleaned.str.replace(old, new, regex=False)
    df[column] = cleaned

In [35]:
# Preview the first rows to check the reformatted datetime and cleaned text columns
df.head()

Unnamed: 0,video_id,datetime,title,transcript
0,S2rqRaL1P8w,2024-12-03 02:48:10,What Was It Like Boxing After So Many Muay Tha...,howdy welcome to my run cast I'm so excited be...
1,jquPsuLggI0,2024-11-30 07:52:58,"Post Fight Update - 1st ""Official"" Boxing Fight",hey so I am outside of the Tai payak gym here ...
2,J2b5ziNb2Mg,2024-11-29 03:17:38,Fight 177 - Sylvie vs Nantida Sitweerachat wit...,this is a fight up in nong bua cope which is k...
3,5kFbuiLXvsU,2024-11-20 09:56:40,"Vlog - dogs, treats, and my favorite old lady.",so I just had to explain to my little dog frie...
4,lswgCMqcOZQ,2024-11-12 03:54:44,"Boxing, Sparring, Breast Tissue Health, Gettin...",howdy howdy welcome to my runcast it's been a ...


### Load Process : Load the clean & transformed data into the project's 'data' directory

In [37]:
# Save data as parquet file.
# The target directory is created first so the write does not fail when
# 'data' is missing (e.g. on a fresh checkout).
output_path = 'data/video-transcripts.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_parquet(output_path)