In [104]:
import requests
import json
import polars as pl 
from my_sk import my_key
from youtube_transcript_api import YouTubeTranscriptApi


In [105]:
import requests

API_KEY = my_key
username = "GoogleDevelopers"  # custom name

url = f"https://www.googleapis.com/youtube/v3/channels?part=id&forUsername={username}&key={API_KEY}"
response = requests.get(url)
print(response.json())

data = response.json()

# Safely extract channel ID
if data.get("items"):
    channel_id = data["items"][0]["id"]
    print("Channel ID:", channel_id)
else:
    print("Channel not found.")


{'kind': 'youtube#channelListResponse', 'etag': 'fvIlsvKuIbTm0nBlOAFcZG0oAtA', 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5}, 'items': [{'kind': 'youtube#channel', 'etag': 'eRaJ6QV6wTzbBrL4eK1GacgAVQ8', 'id': 'UC_x5XG1OV2P6uZZ5FSM9Ttw'}]}
Channel ID: UC_x5XG1OV2P6uZZ5FSM9Ttw


In [106]:
url = 'https://www.googleapis.com/youtube/v3/search'
page_token = None
video_record_list = []

Extract Process

In [107]:
def getVideoRecords(response: requests.models.Response)->list:
    '''
        func to extract yt video data from GET request response
    '''

    video_record_list =[]

    for raw_item in json.loads(response.text)['items']:

        if raw_item['id']['kind'] != 'youtube#video':
            continue

        video_record = {}
        video_record['video_id']= raw_item['id']['videoId']
        video_record['datetime']=raw_item['snippet']['publishedAt']
        video_record['title']=raw_item['snippet']['title']

        video_record_list.append(video_record)

    return video_record_list


In [108]:
while page_token != 0:
    params = {
        'key': my_key,
        'channelId': channel_id,
        'part': ['snippet', 'id'],
        'order': 'date',
        'max_result': 50,
        'pageToken': page_token
    }

    response = requests.get(url, params=params)

    video_record_list += getVideoRecords(response)

    try:
        page_token = json.loads(response.text)['nextPageToken']
    except:
        page_token = 0

In [109]:
video_record_list

[{'video_id': 'N1WMctGbz-0',
  'datetime': '2025-05-12T19:00:15Z',
  'title': 'The countdown to #GoogleIO is on!'},
 {'video_id': '8kkqcevLYis',
  'datetime': '2025-05-12T16:00:22Z',
  'title': 'What will the last comparison print? What happens if we change the strings to contain numbers? Go!'},
 {'video_id': '-IThZH0rStY',
  'datetime': '2025-04-24T20:21:02Z',
  'title': 'Cloud Stage - Day 2 (Google I/O 2025)'},
 {'video_id': 'jqEmWBQJon4',
  'datetime': '2025-04-24T20:15:08Z',
  'title': 'AI Stage - Day 2 (Google I/O 2025)'},
 {'video_id': 'XASRYAR7S5w',
  'datetime': '2025-04-24T20:16:26Z',
  'title': 'Web Stage - Day 2 (Google I/O 2025)'},
 {'video_id': 'GHACUmp2GZc',
  'datetime': '2025-04-24T20:12:45Z',
  'title': 'Android Stage - Day 2 (Google I/O 2025)'},
 {'video_id': '4pB2IhK_zWo',
  'datetime': '2025-04-24T19:53:41Z',
  'title': 'Web Stage - Day 1 (Google I/O 2025)'},
 {'video_id': 'zwosRCbMD3w',
  'datetime': '2025-04-24T19:55:26Z',
  'title': 'Cloud Stage - Day 1 (Google I

In [110]:
df = pl.DataFrame(video_record_list)
print(df.head)

<bound method DataFrame.head of shape: (85, 3)
┌─────────────┬──────────────────────┬─────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                           │
│ ---         ┆ ---                  ┆ ---                             │
│ str         ┆ str                  ┆ str                             │
╞═════════════╪══════════════════════╪═════════════════════════════════╡
│ N1WMctGbz-0 ┆ 2025-05-12T19:00:15Z ┆ The countdown to #GoogleIO is … │
│ 8kkqcevLYis ┆ 2025-05-12T16:00:22Z ┆ What will the last comparison … │
│ -IThZH0rStY ┆ 2025-04-24T20:21:02Z ┆ Cloud Stage - Day 2 (Google I/… │
│ jqEmWBQJon4 ┆ 2025-04-24T20:15:08Z ┆ AI Stage - Day 2 (Google I/O 2… │
│ XASRYAR7S5w ┆ 2025-04-24T20:16:26Z ┆ Web Stage - Day 2 (Google I/O … │
│ …           ┆ …                    ┆ …                               │
│ 65HoWqBboI8 ┆ 2019-01-15T18:07:28Z ┆ AbsorbPointer (Flutter Widget … │
│ IYDVcriKjsw ┆ 2019-01-08T18:04:52Z ┆ LayoutBuilder (Flutter Widget … │
│ T4

In [111]:
def extract_text(transcript: list)->str:
    '''
        func to extract tex tfrom transcript dictionary
    '''

    text_list = []
    for i in range(len(transcript)):
        text_list.append(i['text'])
    
    return ' '.join(text_list)


In [112]:
transcropt_text_list = []

for i in range(len(df)):

    try:
        transcript = YouTubeTranscriptApi.get_transcript(df['video_id'][i])
        transcript_text = extract_text(transcript)

    except:
        transcript_text = 'n/a'

    transcropt_text_list.append(transcript_text)

In [113]:
df = df.with_columns(pl.Series(name='transcript', values=transcropt_text_list))
print(df.head)

<bound method DataFrame.head of shape: (85, 4)
┌─────────────┬──────────────────────┬─────────────────────────────────┬────────────┐
│ video_id    ┆ datetime             ┆ title                           ┆ transcript │
│ ---         ┆ ---                  ┆ ---                             ┆ ---        │
│ str         ┆ str                  ┆ str                             ┆ str        │
╞═════════════╪══════════════════════╪═════════════════════════════════╪════════════╡
│ N1WMctGbz-0 ┆ 2025-05-12T19:00:15Z ┆ The countdown to #GoogleIO is … ┆ n/a        │
│ 8kkqcevLYis ┆ 2025-05-12T16:00:22Z ┆ What will the last comparison … ┆ n/a        │
│ -IThZH0rStY ┆ 2025-04-24T20:21:02Z ┆ Cloud Stage - Day 2 (Google I/… ┆ n/a        │
│ jqEmWBQJon4 ┆ 2025-04-24T20:15:08Z ┆ AI Stage - Day 2 (Google I/O 2… ┆ n/a        │
│ XASRYAR7S5w ┆ 2025-04-24T20:16:26Z ┆ Web Stage - Day 2 (Google I/O … ┆ n/a        │
│ …           ┆ …                    ┆ …                               ┆ …          │
│ 65HoW

Transform

In [114]:
#removing duplicates
df = df.unique(subset=["video_id"], keep="first")  


In [115]:
print('Shape: ',df.shape)
print('unique rows: ', df.n_unique)

for i in range(df.shape[1]):
    print(f'unique elements ({df.columns[i]}), {df[:, i].n_unique()}')

Shape:  (83, 4)
unique rows:  <bound method DataFrame.n_unique of shape: (83, 4)
┌─────────────┬──────────────────────┬─────────────────────────────────┬────────────┐
│ video_id    ┆ datetime             ┆ title                           ┆ transcript │
│ ---         ┆ ---                  ┆ ---                             ┆ ---        │
│ str         ┆ str                  ┆ str                             ┆ str        │
╞═════════════╪══════════════════════╪═════════════════════════════════╪════════════╡
│ ww7RItTgO48 ┆ 2022-05-11T20:25:51Z ┆ Developer Keynote (Google I/O … ┆ n/a        │
│ lyRPyRKHO8M ┆ 2019-05-07T21:57:34Z ┆ Google Keynote (Google I/O&#39… ┆ n/a        │
│ 9z_YNlRlWfA ┆ 2019-01-22T18:43:48Z ┆ Transform (Flutter Widget of t… ┆ n/a        │
│ iR2O2GPbB0E ┆ 2023-05-05T23:00:18Z ┆ What are Large Language Models… ┆ n/a        │
│ dYRs7Q1vfYI ┆ 2019-01-29T21:57:23Z ┆ BackdropFilter (Flutter Widget… ┆ n/a        │
│ …           ┆ …                    ┆ …                   

In [116]:
#changing the datetime to Datetime dtype
df = df.with_columns(pl.col('datetime').cast(pl.Datetime))
df.head

<bound method DataFrame.head of shape: (83, 4)
┌─────────────┬─────────────────────┬─────────────────────────────────┬────────────┐
│ video_id    ┆ datetime            ┆ title                           ┆ transcript │
│ ---         ┆ ---                 ┆ ---                             ┆ ---        │
│ str         ┆ datetime[μs]        ┆ str                             ┆ str        │
╞═════════════╪═════════════════════╪═════════════════════════════════╪════════════╡
│ ww7RItTgO48 ┆ 2022-05-11 20:25:51 ┆ Developer Keynote (Google I/O … ┆ n/a        │
│ lyRPyRKHO8M ┆ 2019-05-07 21:57:34 ┆ Google Keynote (Google I/O&#39… ┆ n/a        │
│ 9z_YNlRlWfA ┆ 2019-01-22 18:43:48 ┆ Transform (Flutter Widget of t… ┆ n/a        │
│ iR2O2GPbB0E ┆ 2023-05-05 23:00:18 ┆ What are Large Language Models… ┆ n/a        │
│ dYRs7Q1vfYI ┆ 2019-01-29 21:57:23 ┆ BackdropFilter (Flutter Widget… ┆ n/a        │
│ …           ┆ …                   ┆ …                               ┆ …          │
│ zwosRCbMD3w ┆ 20