In [32]:
import googleapiclient.discovery
import googleapiclient.errors

from keys import API_KEY
from typing import Dict, Any

# Name and version of the Google API the client is built for.
api_service_name = "youtube"
api_version = "v3"

# Module-level API client, authenticated with the API_KEY imported from `keys`.
# get_playlist() issues its requests through this object.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=API_KEY,
)

def get_playlist(playlist_id: str, is_first_request=True, next_page_token=None) -> Dict[str, Any]:
    """Request one page of a playlist's items through the module-level ``youtube`` client.

    The request always asks for ``part="snippet"`` with ``maxResults="50"``.

    Args:
        playlist_id: Value sent as ``playlistId``.
        is_first_request: When truthy, no ``pageToken`` is sent and
            ``next_page_token`` is ignored.
        next_page_token: Value sent as ``pageToken`` when ``is_first_request``
            is falsy.

    Returns:
        Whatever ``execute()`` returns for the ``playlistItems().list`` request.
    """
    request_args = {
        "part": "snippet",
        "playlistId": playlist_id,
        "maxResults": "50",
    }

    # Only follow-up requests carry a page token.
    if not is_first_request:
        request_args["pageToken"] = next_page_token

    return youtube.playlistItems().list(**request_args).execute()
    

In [33]:
# Four playlists holding the most popular YouTube videos of all time, e.g.
# https://www.youtube.com/playlist?list=PLirAqAtl_h2r5g8xGajEwdXd3x1sZh8hC

playlist_id_list = [
    "PLirAqAtl_h2r5g8xGajEwdXd3x1sZh8hC",
    "PLirAqAtl_h2o4xCWaBsDH3BKNQs8YqLCL",
    "PLirAqAtl_h2p57Njt3QJDtwxAPZENJrIp",
    "PLirAqAtl_h2rTbOXU2Oc-7WBBHmFrnyUC",
]
res_item_list = []

for playlist_id in playlist_id_list:
    # The first page is requested without a page token.
    response = get_playlist(playlist_id)
    res_item_list += response.get('items')
    print(playlist_id)

    # Keep requesting pages for as long as the latest response names a next one.
    while 'nextPageToken' in response:
        next_page_token = response['nextPageToken']
        print(f"current len: {len(res_item_list)}")
        response = get_playlist(
            playlist_id,
            is_first_request=False,
            next_page_token=next_page_token,
        )
        res_item_list += response.get('items')

PLirAqAtl_h2r5g8xGajEwdXd3x1sZh8hC
current len: 50
current len: 100
current len: 150
current len: 200
current len: 250
current len: 300
current len: 350
current len: 400
current len: 450
current len: 500
PLirAqAtl_h2o4xCWaBsDH3BKNQs8YqLCL
current len: 589
current len: 639
current len: 689
current len: 739
current len: 789
current len: 839
current len: 889
current len: 939
current len: 989
current len: 1039
PLirAqAtl_h2p57Njt3QJDtwxAPZENJrIp
current len: 1094
current len: 1144
current len: 1194
current len: 1244
current len: 1294
current len: 1344
current len: 1394
current len: 1444
current len: 1494
current len: 1544
PLirAqAtl_h2rTbOXU2Oc-7WBBHmFrnyUC
current len: 1640
current len: 1690
current len: 1740
current len: 1790
current len: 1840
current len: 1890
current len: 1940
current len: 1990
current len: 2040


In [38]:
import json

# Dump the raw playlist items collected in res_item_list to disk.
with open('all_raw_data.json', 'w') as f:
    json.dump(res_item_list, f, ensure_ascii=True)

# Load the dump back. Reading inside a `with` block closes the handle as soon
# as loading finishes; `file` stays bound afterwards, but to a closed handle.
with open('all_raw_data.json') as file:
    data = json.load(file)

# Display the first stored item.
data[0]

In [43]:
# Flatten every raw playlist item into one row of the table that becomes the CSV.
import pandas as pd

df_items = []

for res in res_item_list:
    snippet = res['snippet']
    video_id = snippet['resourceId']['videoId']

    row = (
        video_id,
        'https://www.youtube.com/watch?v=' + video_id,
        snippet['title'],
        # The owner-channel fields can be missing from a snippet; such rows
        # get an empty string instead.
        snippet.get('videoOwnerChannelTitle', ''),
        snippet.get('videoOwnerChannelId', ''),
    )
    df_items.append(row)

column_names = ['video_id', 'video_url', 'video_title',
                'channel_title', 'channel_id']
df = pd.DataFrame(df_items, columns=column_names)
df
                  

Unnamed: 0,video_id,video_url,video_title,channel_title,channel_id
0,XqZsoesa55w,https://www.youtube.com/watch?v=XqZsoesa55w,Baby Shark Dance | #babyshark Most Viewed Vide...,Baby Shark - Pinkfong Kids’ Songs & Stories,UCcdwLMPsaU2ezNSJU1nFoBQ
1,kJQP7kiw5Fk,https://www.youtube.com/watch?v=kJQP7kiw5Fk,Luis Fonsi - Despacito ft. Daddy Yankee,LuisFonsiVEVO,UCLp8RBhQHu9wSsq62j_Md6A
2,F4tHL8reNCs,https://www.youtube.com/watch?v=F4tHL8reNCs,Johny Johny Yes Papa 👶 THE BEST Song for Child...,LooLoo Kids - Nursery Rhymes and Children's Songs,UC4NALVCmcmL5ntpV0thoH6w
3,WRVsOCh907o,https://www.youtube.com/watch?v=WRVsOCh907o,Bath Song | @CoComelon Nursery Rhymes & Kids S...,Cocomelon - Nursery Rhymes,UCbCmjCuTUZos6Inko4u57UQ
4,JGwWNGJdvx8,https://www.youtube.com/watch?v=JGwWNGJdvx8,Ed Sheeran - Shape of You (Official Music Video),Ed Sheeran,UC0C-w0YjGpqDXGB8IHb662A
...,...,...,...,...,...
2058,ruav0KvQOOg,https://www.youtube.com/watch?v=ruav0KvQOOg,Private video,,
2059,E2VCwBzGdPM,https://www.youtube.com/watch?v=E2VCwBzGdPM,Deleted video,,
2060,LYhrYHmUPn0,https://www.youtube.com/watch?v=LYhrYHmUPn0,Deleted video,,
2061,wsGDSCIPfx4,https://www.youtube.com/watch?v=wsGDSCIPfx4,Deleted video,,


In [44]:
from collections import Counter

# Count how many rows share each video title; titles that occur more than
# once point at duplicates or placeholder entries.
title_counts = Counter(df['video_title'])
title_counts

Counter({'Private video': 43,
         'Deleted video': 19,
         'Alex Rose  ft. Cazzu, Lenny Tavarez, Lyanno & Rauw Alejandro - Toda (Remix) [Video Oficial]': 2,
         'Bath Song + More Nursery Rhymes & Kids Songs - CoComelon': 2,
         'Baby Shark Dance | #babyshark Most Viewed Video | Animal Songs | PINKFONG Songs for Children': 1,
         'Luis Fonsi - Despacito ft. Daddy Yankee': 1,
         'Johny Johny Yes Papa 👶 THE BEST Song for Children | LooLoo Kids': 1,
         'Bath Song | @CoComelon Nursery Rhymes & Kids Songs': 1,
         'Ed Sheeran - Shape of You (Official Music Video)': 1,
         'Wiz Khalifa - See You Again ft. Charlie Puth [Official Video] Furious 7 Soundtrack': 1,
         'Wheels on the Bus | @CoComelon Nursery Rhymes & Kids Songs': 1,
         'Phonics Song with TWO Words - A For Apple - ABC Alphabet Songs with Sounds for Children': 1,
         'Mark Ronson - Uptown Funk (Official Video) ft. Bruno Mars': 1,
         'PSY - GANGNAM STYLE(강남스타일) M/V'

In [55]:
from pathlib import Path

# Keep one row per video id, then drop rows whose title is one of the
# placeholder titles. The chain builds a new frame instead of mutating `df`
# in place, so re-running this cell gives the same result.
placeholder_titles = ['Private video', 'Deleted video']
df = (
    df
    .drop_duplicates(subset=['video_id'])
    .loc[lambda frame: ~frame['video_title'].isin(placeholder_titles)]
    .reset_index(drop=True)
)

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
output_path = Path('most_popular_videos/most_popular_1999.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)