# YouTube Data API - scraping trailer videos from producer channels

In [1]:
# Import libraries
import os

import pandas as pd
from googleapiclient.discovery import build

In [None]:
# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable so the
# real key never has to be written into (and shared with) the notebook. The literal
# placeholder is kept as the fallback for anyone who still pastes a key here.
api_key = os.environ.get('YOUTUBE_API_KEY', 'ENTERYOURAPIKEY')

# Ids of the channels whose uploads are collected below.
channel_ids = [
    'UC2-BeLxzUBSs0uSrmzWhJuQ',  # 20th Century Studios
    'UC6pGDc4bFGD1_36IKv3FnYg',  # Crunchyroll Collection
    'UChCUEwJrrG01_L7w4Bh1xqA',  # Disney Entertainment Studios
    'UCuaFvcY4MhZY3U43mMt1dYQ',  # Walt Disney Studios
    'UCGoLJHggXySEzxaEOaDZDbQ',  # Focus Entertainment
    'UCO7rKYuE7EMHYFcwP-obqhg',  # Gravitas Ventures
    'UCJ6nMHaJPZvsJ-HmUmj1SeA',  # Lionsgate Movies
    'UCf5CjDJvsFvtVIhkfmKAwAA',  # MGM
    'UCt4Xdx38tkV5_3IkOJAFw-Q',  # Orion Pictures
    'UCF9imwPMSGz4Vq1NiTWCC7g',  # Paramount Pictures
    'UCor9rW6PgxSQ9vUPWQdnaYQ',  # SearchlightPictures
    'UCz97F7dMxBNOfGYu3rx8aCw',  # Sony Pictures Entertainment
    'UCq0OueAsdxH6b8nyAspwViw',  # Universal Pictures
    'UCjmJDM5pRKbUlVIzDYYWb6g',  # Warner Bros. Pictures
    'UCvC4D8onUfXzvjTOM-dBfEA',  # Marvel Entertainment
]

# API client passed to every helper function in this notebook.
youtube = build('youtube', 'v3', developerKey=api_key)

## Pulling channel statistics¹

#### (1) Adapted from the original code by Thoufiq, available at https://techtfq.com/video/python-project-to-scrape-youtube-using-youtube-data-api

In [3]:
def get_channel_stats(youtube, channel_ids):
    """Fetch name, id, counters and uploads-playlist id for the given channels.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_ids: list of channel id strings to look up.

    Returns:
        A list with one dict per channel found in the responses, with the keys
        ``Channel_name``, ``Channel_id``, ``Subscribers``, ``Views``,
        ``Total_videos`` and ``playlist_id`` (in that order). Counter values are
        passed through exactly as received; a counter that is absent from a
        channel's ``statistics`` is reported as ``None``. An empty
        ``channel_ids`` returns ``[]`` without calling the API.
    """
    # Same batch size get_video_details uses for its comma-separated id lists.
    batch_size = 50
    all_data = []

    for start in range(0, len(channel_ids), batch_size):
        request = youtube.channels().list(
            part='snippet,contentDetails,statistics',
            id=','.join(channel_ids[start:start + batch_size]))
        response = request.execute()

        # A response without an 'items' key (no channel matched) is treated as empty.
        for item in response.get('items', []):
            statistics = item.get('statistics', {})
            all_data.append(dict(
                Channel_name=item['snippet']['title'],
                Channel_id=item['id'],
                # .get() so a channel lacking one counter does not abort the whole run.
                Subscribers=statistics.get('subscriberCount'),
                Views=statistics.get('viewCount'),
                Total_videos=statistics.get('videoCount'),
                playlist_id=item['contentDetails']['relatedPlaylists']['uploads']))

    return all_data

In [4]:
# Fetch the per-channel records (a list of dicts) for every id in channel_ids
channel_statistics = get_channel_stats(youtube, channel_ids)

In [5]:
# Tabulate the records: one row per channel, one column per dict key
channel_data = pd.DataFrame(channel_statistics)

In [6]:
# Display the channel table
channel_data

Unnamed: 0,Channel_name,Channel_id,Subscribers,Views,Total_videos,playlist_id
0,Marvel Entertainment,UCvC4D8onUfXzvjTOM-dBfEA,19500000,5101794220,7994,UUvC4D8onUfXzvjTOM-dBfEA
1,Disney Entertainment Studios,UChCUEwJrrG01_L7w4Bh1xqA,139000,17989429,2983,UUhCUEwJrrG01_L7w4Bh1xqA
2,Paramount Pictures,UCF9imwPMSGz4Vq1NiTWCC7g,2630000,1904015545,1475,UUF9imwPMSGz4Vq1NiTWCC7g
3,Walt Disney Studios,UCuaFvcY4MhZY3U43mMt1dYQ,4110000,1750018240,1328,UUuaFvcY4MhZY3U43mMt1dYQ
4,Warner Bros. Pictures,UCjmJDM5pRKbUlVIzDYYWb6g,10600000,6281670064,2578,UUjmJDM5pRKbUlVIzDYYWb6g
5,20th Century Studios,UC2-BeLxzUBSs0uSrmzWhJuQ,4470000,2673336863,2415,UU2-BeLxzUBSs0uSrmzWhJuQ
6,Universal Pictures,UCq0OueAsdxH6b8nyAspwViw,6550000,4263755397,1317,UUq0OueAsdxH6b8nyAspwViw
7,Focus Entertainment,UCGoLJHggXySEzxaEOaDZDbQ,173000,164856513,79,UUGoLJHggXySEzxaEOaDZDbQ
8,Orion Pictures,UCt4Xdx38tkV5_3IkOJAFw-Q,136000,329939596,113,UUt4Xdx38tkV5_3IkOJAFw-Q
9,Gravitas Ventures,UCO7rKYuE7EMHYFcwP-obqhg,93700,22813389,750,UUO7rKYuE7EMHYFcwP-obqhg


In [7]:
# Turn the three counter columns into numeric dtypes so they can be used in calculations
for counter_column in ('Subscribers', 'Views', 'Total_videos'):
    channel_data[counter_column] = pd.to_numeric(channel_data[counter_column])
# To inspect the resulting dtypes: channel_data.dtypes

## Pulling video ids²

#### (2) Adapted from the original code by Thoufiq, available at https://techtfq.com/video/python-project-to-scrape-youtube-using-youtube-data-api

In [8]:
def get_video_ids(youtube, playlist_id):
    """Return the video ids of every item in a playlist, following all result pages.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: id of the playlist to walk through.

    Returns:
        A list of video id strings in the order the pages deliver them.
    """
    collected = []
    page_token = None

    while True:
        params = dict(part='contentDetails',
                      playlistId=playlist_id,
                      maxResults=50)
        # The first request carries no pageToken; later ones resume where the last page ended.
        if page_token is not None:
            params['pageToken'] = page_token

        page = youtube.playlistItems().list(**params).execute()

        for entry in page['items']:
            collected.append(entry['contentDetails']['videoId'])

        # No nextPageToken in the response means this was the last page.
        page_token = page.get('nextPageToken')
        if page_token is None:
            return collected

In [9]:
# Collect the ids of every uploaded video across all channels.
# The uploads playlist ids are iterated directly instead of being looked up by
# Channel_name: a name lookup always returns the first match, so two channels that
# share a name would fetch the same playlist twice and skip the other one.
video_ids = []

for playlist_id in channel_data['playlist_id']:
    video_ids.extend(get_video_ids(youtube, playlist_id))

## Pulling video details³

#### (3) Adapted from the original code by Thoufiq, available at https://techtfq.com/video/python-project-to-scrape-youtube-using-youtube-data-api

In [10]:
def get_video_details(youtube, video_ids):
    """Fetch title, publication timestamp, counters and channel id for each video.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: list of video id strings; they are requested in batches of 50.

    Returns:
        A list with one dict per video found in the responses, with the keys
        ``Title``, ``Published_date``, ``Views``, ``Likes``, ``Comments`` and
        ``Channel_id`` (in that order). Values are passed through exactly as
        received; any of the three counters that is absent from a video's
        ``statistics`` is reported as ``None``.
    """
    batch_size = 50
    all_video_stats = []

    for start in range(0, len(video_ids), batch_size):
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(video_ids[start:start + batch_size]))
        response = request.execute()

        for item in response.get('items', []):
            snippet = item['snippet']
            statistics = item.get('statistics', {})
            all_video_stats.append(dict(
                Title=snippet['title'],
                Published_date=snippet['publishedAt'],
                # Each counter is looked up independently: .get() yields None for a
                # missing one, which covers every present/absent combination.
                Views=statistics.get('viewCount'),
                Likes=statistics.get('likeCount'),
                Comments=statistics.get('commentCount'),
                Channel_id=snippet['channelId']))

    return all_video_stats

In [11]:
# Fetch the per-video records (a list of dicts) for every collected video id
video_details = get_video_details(youtube, video_ids)

## Formatting, filtering and saving the dataset created by web scraping

In [12]:
# Tabulate the records: one row per video, one column per dict key
video_data = pd.DataFrame(video_details)
video_data

Unnamed: 0,Title,Published_date,Views,Likes,Comments,Channel_id
0,Hulk | Marvel Studios’ Legends | Disney+,2022-08-10T21:00:22Z,1712126,76964,3490,UCvC4D8onUfXzvjTOM-dBfEA
1,How to Draw Storm with Alex Lins | Marvel LIVE,2022-08-10T20:33:04Z,25738,1475,29,UCvC4D8onUfXzvjTOM-dBfEA
2,Dance Off | Marvel Studios' I Am Groot | Disney+,2022-08-10T17:00:35Z,373931,19964,574,UCvC4D8onUfXzvjTOM-dBfEA
3,Meet Namor: Marvel's First Mutant!,2022-08-10T15:00:14Z,83413,7052,545,UCvC4D8onUfXzvjTOM-dBfEA
4,"What If ""Thwip"" Was Outlawed?!",2022-08-09T20:00:24Z,50815,3543,236,UCvC4D8onUfXzvjTOM-dBfEA
...,...,...,...,...,...,...
39527,The Girl with the Dragon Tattoo - Official Tra...,2012-10-04T23:06:13Z,46922,252,5,UCf5CjDJvsFvtVIhkfmKAwAA
39528,Hot Tub Time Machine Official Preview,2010-02-17T22:57:50Z,402161,1001,129,UCf5CjDJvsFvtVIhkfmKAwAA
39529,Hot Tub Time Machine Official Trailer,2009-12-17T18:52:47Z,477623,1249,131,UCf5CjDJvsFvtVIhkfmKAwAA
39530,Fame Music Video,2009-09-15T20:07:29Z,9456205,36111,1543,UCf5CjDJvsFvtVIhkfmKAwAA


In [13]:
# Parse the publication timestamps and keep only their calendar date
published_at = pd.to_datetime(video_data['Published_date'])
video_data['Published_date'] = published_at.dt.date

# Turn the three counter columns into numeric dtypes
for counter_column in ('Views', 'Likes', 'Comments'):
    video_data[counter_column] = pd.to_numeric(video_data[counter_column])

video_data

Unnamed: 0,Title,Published_date,Views,Likes,Comments,Channel_id
0,Hulk | Marvel Studios’ Legends | Disney+,2022-08-10,1712126.0,76964.0,3490.0,UCvC4D8onUfXzvjTOM-dBfEA
1,How to Draw Storm with Alex Lins | Marvel LIVE,2022-08-10,25738.0,1475.0,29.0,UCvC4D8onUfXzvjTOM-dBfEA
2,Dance Off | Marvel Studios' I Am Groot | Disney+,2022-08-10,373931.0,19964.0,574.0,UCvC4D8onUfXzvjTOM-dBfEA
3,Meet Namor: Marvel's First Mutant!,2022-08-10,83413.0,7052.0,545.0,UCvC4D8onUfXzvjTOM-dBfEA
4,"What If ""Thwip"" Was Outlawed?!",2022-08-09,50815.0,3543.0,236.0,UCvC4D8onUfXzvjTOM-dBfEA
...,...,...,...,...,...,...
39527,The Girl with the Dragon Tattoo - Official Tra...,2012-10-04,46922.0,252.0,5.0,UCf5CjDJvsFvtVIhkfmKAwAA
39528,Hot Tub Time Machine Official Preview,2010-02-17,402161.0,1001.0,129.0,UCf5CjDJvsFvtVIhkfmKAwAA
39529,Hot Tub Time Machine Official Trailer,2009-12-17,477623.0,1249.0,131.0,UCf5CjDJvsFvtVIhkfmKAwAA
39530,Fame Music Video,2009-09-15,9456205.0,36111.0,1543.0,UCf5CjDJvsFvtVIhkfmKAwAA


In [14]:
# Attach the channel name to every video via a left join on Channel_id.
# - The two lookup columns are selected by name rather than by a label range, so the
#   join does not depend on the column order of channel_data.
# - A Channel_name column left over from an earlier run is dropped first, so
#   re-executing this cell does not create Channel_name_x / Channel_name_y.
channel_names = channel_data[['Channel_id', 'Channel_name']]
video_data = pd.merge(
    video_data.drop(columns='Channel_name', errors='ignore'),
    channel_names,
    how='left',
    on='Channel_id')
video_data

Unnamed: 0,Title,Published_date,Views,Likes,Comments,Channel_id,Channel_name
0,Hulk | Marvel Studios’ Legends | Disney+,2022-08-10,1712126.0,76964.0,3490.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
1,How to Draw Storm with Alex Lins | Marvel LIVE,2022-08-10,25738.0,1475.0,29.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
2,Dance Off | Marvel Studios' I Am Groot | Disney+,2022-08-10,373931.0,19964.0,574.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
3,Meet Namor: Marvel's First Mutant!,2022-08-10,83413.0,7052.0,545.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
4,"What If ""Thwip"" Was Outlawed?!",2022-08-09,50815.0,3543.0,236.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
...,...,...,...,...,...,...,...
39527,The Girl with the Dragon Tattoo - Official Tra...,2012-10-04,46922.0,252.0,5.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM
39528,Hot Tub Time Machine Official Preview,2010-02-17,402161.0,1001.0,129.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM
39529,Hot Tub Time Machine Official Trailer,2009-12-17,477623.0,1249.0,131.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM
39530,Fame Music Video,2009-09-15,9456205.0,36111.0,1543.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM


In [16]:
# Filter to keep only official movie trailers/teasers.
# One case-insensitive pattern replaces the four exact-case checks, so titles such as
# "OFFICIAL TRAILER" are no longer dropped; na=False makes a missing title count as
# "no match" instead of producing a NaN in the boolean mask.
is_trailer_or_teaser = video_data['Title'].str.contains('trailer|teaser', case=False, na=False)
video_data = video_data[is_trailer_or_teaser]

In [17]:
# Display the filtered video table
video_data

Unnamed: 0,Title,Published_date,Views,Likes,Comments,Channel_id,Channel_name
12,Marvel Studios’ Assembled: The Making of Ms. M...,2022-08-03,356975.0,14835.0,621.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
21,ALL-OUT AVENGERS #1 Trailer | Marvel Comics,2022-07-26,100369.0,4796.0,461.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
35,Marvel Studios’ Black Panther: Wakanda Forever...,2022-07-24,35367133.0,1409575.0,62712.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
36,Official Trailer | She-Hulk: Attorney at Law |...,2022-07-24,21715675.0,580781.0,36385.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
53,I Am Groot | Official Trailer | Disney+,2022-07-22,17876830.0,685331.0,17958.0,UCvC4D8onUfXzvjTOM-dBfEA,Marvel Entertainment
...,...,...,...,...,...,...,...
39525,Hope Springs - Official Trailer,2012-10-04,15928.0,108.0,6.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM
39526,21 Jump Street - Official Trailer,2012-10-04,45674.0,332.0,2.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM
39527,The Girl with the Dragon Tattoo - Official Tra...,2012-10-04,46922.0,252.0,5.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM
39529,Hot Tub Time Machine Official Trailer,2009-12-17,477623.0,1249.0,131.0,UCf5CjDJvsFvtVIhkfmKAwAA,MGM


In [18]:
# Save the filtered dataset to an Excel workbook (without the index column).
# Alternative output formats, kept disabled — uncomment the one you need:
#video_data.to_csv("videos_csv.csv", encoding='utf-8', index=False)
#video_data.to_csv("videos_tsv.tsv", encoding='utf-8', index=False, sep="\t")
video_data.to_excel('videos_excel.xlsx', index=False)

# End