In [1]:
import pandas as pd
from googleapiclient.discovery import build
import datetime
import progressbar
from pathlib import Path
import isodate
from IPython.display import JSON

In [2]:
# Load the API key from a local text file instead of hardcoding it in the notebook.
# The context manager closes the handle, and strip() removes the trailing newline
# that readline() would otherwise leave attached to the key.
with open('API_KEY.txt', 'r') as key_file:
    api_key = key_file.readline().strip()

# Getting top finance channels

In [3]:
api_service_name = "youtube"
api_version = "v3"

# API client reused by every later request in the notebook.
youtube = build(api_service_name, api_version, developerKey=api_key)

channels_id = [
    "UCV6KDgJskWaEckne5aPA0aQ", # GrahamStephan
    "UCGy7SkBjcIAgTiwkXEtPnYg" # AndreiJikh (No personal stuff putting you second Andrei)
]

# A single channels.list call covers every id in the list above.
request = youtube.channels().list(
    part="snippet,contentDetails,statistics",
    id=",".join(channels_id),
)
response = request.execute()

# Flatten each returned channel into one record of the fields used later on.
channels_info = []
for channel_item in response["items"]:
    snippet = channel_item["snippet"]
    uploads_playlist = channel_item["contentDetails"]["relatedPlaylists"]["uploads"]
    stats = channel_item["statistics"]
    channels_info.append({
        "channelName": snippet["title"],
        "startDate": snippet["publishedAt"],
        "relatedPlaylist": uploads_playlist,
        "totalViews": int(stats["viewCount"]),
        "subscriberCount": int(stats["subscriberCount"]),
        "videoCount": int(stats["videoCount"]),
        "extractionDate": datetime.datetime.now().strftime("%Y-%m-%d"),
    })

channelsInfo = pd.DataFrame(channels_info)

# Show the raw API response as the cell output.
JSON(response)

<IPython.core.display.JSON object>

In [4]:
# Preview the per-channel summary frame.
channelsInfo.head()

Unnamed: 0,channelName,startDate,relatedPlaylist,totalViews,subscriberCount,videoCount,extractionDate
0,Graham Stephan,2016-12-25T07:48:56Z,UUV6KDgJskWaEckne5aPA0aQ,391018385,3880000,799,2022-07-11
1,Andrei Jikh,2017-03-24T00:43:42Z,UUGy7SkBjcIAgTiwkXEtPnYg,198416221,2070000,414,2022-07-11


In [5]:
def get_videos_from_playlist(playlistId, client=None):
    """Collect the video id of every item in a playlist.

    Requests ``playlistItems().list`` 50 items at a time and keeps following
    ``nextPageToken`` until a response no longer carries one.

    Parameters
    ----------
    playlistId
        Id of the playlist to read; passed to the API unchanged.
    client : optional
        Object exposing ``playlistItems().list(...).execute()``. When omitted,
        the notebook-level ``youtube`` client is used.

    Returns
    -------
    list
        ``contentDetails.videoId`` of each item, in the order the API returned them.
    """
    if client is None:
        client = youtube

    videoId_list = []
    page_token = None
    while True:
        query = {
            "part": "contentDetails",
            "playlistId": playlistId,
            "maxResults": 50,
        }
        # The first request carries no token; later ones resume where the last page ended.
        if page_token is not None:
            query["pageToken"] = page_token

        response = client.playlistItems().list(**query).execute()
        videoId_list.extend(
            item["contentDetails"]["videoId"] for item in response["items"]
        )

        page_token = response.get("nextPageToken")
        if page_token is None:
            return videoId_list

# Extracting the video IDs of every channel

In [6]:
# Map each channel name to the ids of every video in its related playlist.
# Name and playlist id are read from the same row, so each row is fetched from
# its own playlist without re-filtering the frame by name on every iteration.
channel_videos = {}
for ch_name, playlistId in zip(channelsInfo.channelName, channelsInfo.relatedPlaylist):
    channel_videos[ch_name] = get_videos_from_playlist(playlistId)

## Validating that the extraction is complete

In [7]:
# The number of ids fetched per channel must equal the videoCount the API reported.
for ch_name in channelsInfo["channelName"]:
    is_channel = channelsInfo["channelName"] == ch_name
    reported_count = channelsInfo.loc[is_channel, "videoCount"].values[0]
    fetched_count = len(channel_videos[ch_name])
    assert reported_count == fetched_count, "Extraction doesn't match"

In [8]:
# Flatten the per-channel id lists into one list, keeping channel order.
full_video_list = [
    video_id
    for ids_of_channel in channel_videos.values()
    for video_id in ids_of_channel
]

print(f"The full video list contains: {len(full_video_list)} [ids]")

The full video list contains: 1213 [ids]


# Extracting all video stats

In [9]:
def partition(lst, size):
    """Yield consecutive slices of ``lst`` holding at most ``size`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``size``.
    """
    chunk_starts = range(0, len(lst), size)
    yield from (lst[start:start + size] for start in chunk_starts)

# Fields copied from each part of a videos.list item into one flat record.
to_extract = {
    "snippet": ["channelTitle", "title", "description", "tags", "publishedAt"],
    "contentDetails": ["duration", "definition", "caption"],
    "statistics": ["viewCount", "likeCount", "favoriteCount", "commentCount"],
}

bar = progressbar.ProgressBar(maxval=len(full_video_list), \
        widgets=[progressbar.Bar(u"#", '[', ']'), ' ', progressbar.Percentage()])
bar.start()

videoInfo_list = []
processed_ids = 0
for video_list_partition in partition(full_video_list, 50):
    # NOTE(review): the API reference describes maxResults as unsupported together
    # with id; it is kept here so the request stays unchanged.
    request = youtube.videos().list(
        part = "snippet,contentDetails,statistics",
        id = ",".join(video_list_partition),
        maxResults = 50
    )
    response = request.execute()

    for video_response in response["items"]:
        videoInfo_dict = {"videoId": video_response["id"]}
        for part_name, field_names in to_extract.items():
            part_payload = video_response.get(part_name, {})
            for field_name in field_names:
                # Fields absent from the response are recorded as None.
                videoInfo_dict[field_name] = part_payload.get(field_name)
        videoInfo_list.append(videoInfo_dict)

    # Advance by the number of ids requested so far: the bar moves cumulatively
    # across batches and never exceeds maxval, even if the API omits some ids.
    processed_ids += len(video_list_partition)
    bar.update(processed_ids)

bar.finish()

videos_df = pd.DataFrame(videoInfo_list)

[########################################################################] 100%


In [10]:
numeric_columns = ["viewCount", "likeCount", "favoriteCount", "commentCount"]
# Convert column by column (the default axis) instead of once per row; values that
# cannot be parsed, including missing ones, become NaN. The explicit float64 cast
# keeps all count columns on one dtype whether or not a given column contains NaN.
videos_df[numeric_columns] = (
    videos_df[numeric_columns]
    .apply(pd.to_numeric, errors="coerce")
    .astype("float64")
)

In [11]:
# Parse the timestamps once, then derive the calendar features with the
# vectorised .dt accessor instead of a Python lambda per row.
videos_df["publishedAt"] = pd.to_datetime(videos_df["publishedAt"])
# day_name() returns English weekday names by default, independent of the OS locale.
videos_df["publishedAtWeekday"] = videos_df["publishedAt"].dt.day_name()
videos_df["publishedAtDay"] = videos_df["publishedAt"].dt.strftime("%Y-%m-%d")
videos_df["publishedAtHour"] = videos_df["publishedAt"].dt.hour

In [12]:
# Parse the duration strings with isodate and store them as float seconds.
# total_seconds() is used because astype("timedelta64[s]") only changes the
# resolution on pandas >= 2.0 and no longer produces a numeric column.
# NOTE: this overwrites the raw strings, so the cell cannot be re-run on its own output.
videos_df["duration"] = pd.to_timedelta(
    videos_df["duration"].map(isodate.parse_duration)
).dt.total_seconds()

In [13]:
# Sanity-check the final frame: print its (rows, columns) shape, then show the first rows.
print(videos_df.shape)
display(videos_df.head())

(1213, 16)


Unnamed: 0,videoId,channelTitle,title,description,tags,publishedAt,duration,definition,caption,viewCount,likeCount,favoriteCount,commentCount,publishedAtWeekday,publishedAtDay,publishedAtHour
0,IQe4LqIODoA,Graham Stephan,The Housing Market Bubble Just Popped,Lets talk about the current state of the housi...,"[investing, investing for beginners, investing...",2022-07-08 22:00:14+00:00,727.0,hd,False,400548.0,18638.0,0.0,1760.0,Friday,2022-07-08,22
1,Qdwy6IyZVTc,Graham Stephan,An URGENT Warning For ALL Crypto Investors,Thanks to Mine for Sponsoring: Find out which ...,"[investing, investing for beginners, investing...",2022-07-06 21:30:02+00:00,794.0,hd,False,334391.0,14541.0,0.0,1425.0,Wednesday,2022-07-06,21
2,qYHVv3YwHMc,Graham Stephan,The Mother Of All Crashes Is Coming | Michael ...,"GET UP TO $100 OF FREE CRYPTO: Trade Bitcoin, ...","[investing, investing for beginners, investing...",2022-07-01 20:00:12+00:00,775.0,hd,False,672772.0,29218.0,0.0,1600.0,Friday,2022-07-01,20
3,YGq_o-p_i8I,Graham Stephan,The Car Market Bubble Just Popped,Enter at https://www.omaze.com/stephan for you...,"[investing, investing for beginners, investing...",2022-06-29 21:00:14+00:00,782.0,hd,False,1122553.0,46614.0,0.0,2796.0,Wednesday,2022-06-29,21
4,d2h5RWDPIcY,Graham Stephan,The Middle Class Just Got FINANCIALLY RUINED,Click Here To Try Out TrueBill! http://www.tru...,"[investing, investing for beginners, investing...",2022-06-27 22:30:04+00:00,826.0,hd,False,327368.0,18954.0,0.0,1672.0,Monday,2022-06-27,22


In [14]:
# Write the video table to <current working directory>/outputs/videoInfo.csv,
# creating the outputs folder first when it does not exist yet.
cwd = Path.cwd()
output_dir = f"{cwd}/outputs"

csv_target = Path(output_dir, "videoInfo.csv")
csv_target.parent.mkdir(parents=True, exist_ok=True)

videos_df.to_csv(csv_target, index=False)