In [1]:
import pandas as pd
from googleapiclient.discovery import build
import datetime
import progressbar
from pathlib import Path
import isodate
from IPython.display import JSON

In [2]:
# Read the YouTube Data API key from a local file so it is not hardcoded in the
# notebook. Only the first line is used; surrounding whitespace (including the
# trailing newline that readline() keeps) is stripped, and the file is closed
# as soon as the key has been read.
with open('API_KEY.txt', 'r') as key_file:
    api_key = key_file.readline().strip()

# Getting top finance channels

In [3]:
api_service_name = "youtube"
api_version = "v3"

# API client built with the developer key; later cells reuse it for every request.
youtube = build(api_service_name, api_version, developerKey=api_key)

channels_id = [
    "UCV6KDgJskWaEckne5aPA0aQ", # GrahamStephan
    "UCGy7SkBjcIAgTiwkXEtPnYg" # AndreiJikh (No personal stuff putting you second Andrei)
]

# A single request covers every channel: the ids are sent as one comma-separated string.
request = youtube.channels().list(
    part = "snippet,contentDetails,statistics",
    id=','.join(channels_id)
)
response = request.execute()

# Computed once so that every row carries the same extraction date.
extraction_date = datetime.datetime.now().strftime("%Y-%m-%d")

channels_info = []
for channel_response in response["items"]:
    snippet = channel_response["snippet"]
    statistics = channel_response["statistics"]

    # NOTE(review): presumably subscriberCount can be absent when a channel hides
    # its subscriber count (statistics.hiddenSubscriberCount) - confirm against the
    # API reference. A missing value is stored as None instead of raising KeyError.
    subscriber_count = statistics.get("subscriberCount")

    channels_info.append({
        "channelName": snippet["title"],
        "startDate": snippet["publishedAt"],
        # Id of the playlist that holds the channel's uploads.
        "relatedPlaylist": channel_response["contentDetails"]["relatedPlaylists"]["uploads"],
        # The counters are converted with int() before being stored.
        "totalViews": int(statistics["viewCount"]),
        "subscriberCount": int(subscriber_count) if subscriber_count is not None else None,
        "videoCount": int(statistics["videoCount"]),
        "extractionDate": extraction_date,
    })

# One row per channel returned by the API.
channelsInfo = pd.DataFrame(channels_info)

# Render the raw API response as the cell output.
JSON(response)

<IPython.core.display.JSON object>

In [4]:
# Preview the first rows of the per-channel summary table.
channelsInfo.head()

Unnamed: 0,channelName,startDate,relatedPlaylist,totalViews,subscriberCount,videoCount,extractionDate
0,Andrei Jikh,2017-03-24T00:43:42Z,UUGy7SkBjcIAgTiwkXEtPnYg,202590112,2090000,431,2022-07-28
1,Graham Stephan,2016-12-25T07:48:56Z,UUV6KDgJskWaEckne5aPA0aQ,396172600,3910000,807,2022-07-28


In [5]:
def get_videos_from_playlist(playlistId):
    """Collect the video ids of every item in a playlist.

    Pages through ``playlistItems().list`` on the module-level ``youtube``
    client, asking for 50 items per request and following ``nextPageToken``
    until a response no longer carries one.

    Args:
        playlistId: Id of the playlist to read.

    Returns:
        list: The ``contentDetails.videoId`` of each playlist item, in the
        order the API returned them.
    """
    videoId_list = []
    next_page_token = None

    while True:
        params = {
            "part": "contentDetails",
            "playlistId": playlistId,
            "maxResults": 50,
        }
        # The first request is sent without any page token.
        if next_page_token is not None:
            params["pageToken"] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        # A page without an "items" key contributes nothing instead of raising.
        videoId_list.extend(
            item["contentDetails"]["videoId"] for item in response.get("items", [])
        )

        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            return videoId_list

# Extracting the video IDs for every YouTuber

In [6]:
# Fetch the video ids of every channel's uploads playlist, keyed by channel name.
# Name and playlist id are read from the same row, so the frame is not filtered
# again by name for each channel.
channel_videos = {}
for ch_name, playlistId in zip(channelsInfo["channelName"], channelsInfo["relatedPlaylist"]):
    channel_videos[ch_name] = get_videos_from_playlist(playlistId)

## Validating that the extraction is complete

In [7]:
# The number of ids extracted for each channel must equal the videoCount the
# channel statistics reported; the message names the channel and both counts so
# a mismatch can be diagnosed from the traceback alone.
for ch_name, expected_count in zip(channelsInfo["channelName"], channelsInfo["videoCount"]):
    extracted_count = len(channel_videos[ch_name])
    assert expected_count == extracted_count, \
        f"Extraction doesn't match for {ch_name}: expected {expected_count} videos, extracted {extracted_count}"

In [8]:
# Flatten the per-channel id lists into a single list, keeping the order in
# which the channels were stored.
full_video_list = [
    video_id
    for ids_of_channel in channel_videos.values()
    for video_id in ids_of_channel
]

print(f"The full video list contains: {len(full_video_list)} [ids]")

The full video list contains: 1238 [ids]


# Extracting all video stats

In [9]:
def partition(lst, size):
    """Yield consecutive slices of ``lst`` holding at most ``size`` elements.

    Args:
        lst: Sliceable sequence with a length (list, tuple, str, ...).
        size: Maximum number of elements per slice; must be at least 1.

    Yields:
        Slices of ``lst`` in order. The last slice is shorter when
        ``len(lst)`` is not a multiple of ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently yield nothing.
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(lst), size):
        yield lst[start : start + size]

# Fields copied from each video resource, grouped by the part that contains them.
to_extract = {
    "snippet": ["channelTitle", "title", "description", "tags", "publishedAt"],
    "contentDetails": ["duration", "definition", "caption"],
    "statistics": ["viewCount", "likeCount", "favoriteCount", "commentCount"],
}

bar = progressbar.ProgressBar(
    maxval=len(full_video_list),
    widgets=[progressbar.Bar(u"#", '[', ']'), ' ', progressbar.Percentage()],
)
bar.start()

videoInfo_list = []
# The ids are requested in batches of 50 per call.
for video_list_partition in partition(full_video_list, 50):
    # NOTE(review): the videos.list reference appears to say maxResults is not
    # supported together with `id`; it is kept because the request ran with it -
    # confirm and drop if it is indeed ignored.
    request = youtube.videos().list(
        part = "snippet,contentDetails,statistics",
        id = ",".join(video_list_partition),
        maxResults = 50
    )
    response = request.execute()

    for video_response in response["items"]:
        videoInfo_dict = {"videoId": video_response["id"]}
        for part_name, field_names in to_extract.items():
            part_content = video_response.get(part_name, {})
            for field_name in field_names:
                # A part or field missing from the response is recorded as None.
                videoInfo_dict[field_name] = part_content.get(field_name)
        videoInfo_list.append(videoInfo_dict)
        # Report the running total across all batches; a per-batch index would
        # restart from 1 on every request and never approach maxval.
        bar.update(len(videoInfo_list))

bar.finish()

# One row per video returned by the API.
videos_df = pd.DataFrame(videoInfo_list)

[########################################################################] 100%


In [10]:
numeric_columns = ["viewCount", "likeCount", "favoriteCount", "commentCount"]
# pd.to_numeric is applied once per column (the default axis) rather than once
# per row; values that cannot be parsed become NaN because of errors="coerce".
# Each column gets its own numeric dtype as a result.
videos_df[numeric_columns] = videos_df[numeric_columns].apply(pd.to_numeric, errors = "coerce")

In [11]:
# Parse the publication timestamp, then derive the calendar features from it
# with the vectorized .dt accessor instead of per-element lambdas.
videos_df["publishedAt"] = pd.to_datetime(videos_df["publishedAt"])
videos_df["publishedAtWeekday"] = videos_df["publishedAt"].dt.strftime("%A")  # full weekday name
videos_df["publishedAtDay"] = videos_df["publishedAt"].dt.strftime("%Y-%m-%d")  # date part only
videos_df["publishedAtHour"] = videos_df["publishedAt"].dt.hour

In [12]:
# Parse each duration string with isodate, then express it as a number of seconds.
parsed_durations = videos_df["duration"].map(isodate.parse_duration)
# total_seconds() returns float seconds on every pandas version, whereas
# astype("timedelta64[s]") keeps a timedelta dtype from pandas 2.0 onwards
# instead of producing a float column.
videos_df["duration"] = pd.to_timedelta(parsed_durations).dt.total_seconds()

In [13]:
# Sanity check of the assembled table: its (rows, columns) shape, then the first rows.
print(videos_df.shape)
display(videos_df.head())

(1238, 16)


Unnamed: 0,videoId,channelTitle,title,description,tags,publishedAt,duration,definition,caption,viewCount,likeCount,favoriteCount,commentCount,publishedAtWeekday,publishedAtDay,publishedAtHour
0,-8FuLG8kMe4,Andrei Jikh,Time Is Money #shorts,► Get up to a $250 in Digital Currency: https:...,"[how to invest, bitcoin, million, ethereum, ma...",2022-07-27 22:15:00+00:00,41.0,hd,False,14122.0,455.0,0.0,68.0,Wednesday,2022-07-27,22
1,dli7Ygf3bA0,Andrei Jikh,The TRUTH About Recessions #shorts,► Get up to a $250 in Digital Currency: https:...,"[how to invest, bitcoin, million, ethereum, ma...",2022-07-26 22:15:01+00:00,40.0,hd,False,21164.0,821.0,0.0,47.0,Tuesday,2022-07-26,22
2,wPMF1Px33Z8,Andrei Jikh,Prepare For The Recession | THIS WEEK,White House just released a statement. Prepare...,"[how to invest, bitcoin, million, ethereum, ma...",2022-07-25 22:30:10+00:00,711.0,hd,False,325114.0,16684.0,0.0,2682.0,Monday,2022-07-25,22
3,BvHLXY0Tsas,Andrei Jikh,How Bad The Stock Market Will Get #shorts,► Get up to a $250 in Digital Currency: https:...,"[how to invest, bitcoin, million, ethereum, ma...",2022-07-24 22:15:00+00:00,51.0,hd,False,48931.0,1619.0,0.0,74.0,Sunday,2022-07-24,22
4,ZsBw1S3s0QY,Andrei Jikh,Real Estate Just Went Crazy #shorts,► Get up to a $250 in Digital Currency: https:...,"[how to invest, bitcoin, million, ethereum, ma...",2022-07-23 22:15:00+00:00,38.0,hd,False,44342.0,1071.0,0.0,105.0,Saturday,2022-07-23,22


In [14]:
# Give videos without tags an empty list, so the column holds lists throughout
# rather than a mix of lists and the string "[]". An empty list is still written
# to CSV as [].
videos_df["tags"] = videos_df["tags"].map(lambda tags: tags if isinstance(tags, list) else [])

In [15]:
# Write the video table to <current working directory>/outputs/videoInfo.csv,
# creating the outputs folder first when it does not exist yet.
cwd = Path().absolute()
output_dir = f"{cwd}/outputs"

outputs_folder = Path(output_dir)
outputs_folder.mkdir(exist_ok = True, parents = True)

csv_target = f"{output_dir}/videoInfo.csv"
videos_df.to_csv(csv_target, index = False)