# Scraping YouTube API

Notebook for scraping the YouTube Data API for YouTube video information. Includes functions for collecting video and channel data, and writes the results to a CSV file.

### Helper Functions

In [120]:
from isodate import parse_duration

# Smoke test for parse_duration: feed it an ISO 8601 duration string and
# print the length in seconds.
duration_string = "PT1H30M"  # one hour and thirty minutes
duration = parse_duration(duration_string)
seconds = duration.total_seconds()
print(seconds)

5400.0


In [121]:
# dictionary with all YT categories
# Keys are the category ids as strings; values are the display names.
# Note that 'Comedy' appears under two different ids ('23' and '34').
category_mapping = {
    '1': 'Film & Animation',
    '2': 'Autos & Vehicles',
    '10': 'Music',
    '15': 'Pets & Animals',
    '17': 'Sports',
    '18': 'Short Movies',
    '19': 'Travel & Events',
    '20': 'Gaming',
    '21': 'Videoblogging',
    '22': 'People & Blogs',
    '23': 'Comedy',
    '24': 'Entertainment',
    '25': 'News & Politics',
    '26': 'Howto & Style',
    '27': 'Education',
    '28': 'Science & Technology',
    '29': 'Nonprofits & Activism',
    '30': 'Movies',
    '31': 'Anime/Animation',
    '32': 'Action/Adventure',
    '33': 'Classics',
    '34': 'Comedy',
    '35': 'Documentary',
    '36': 'Drama',
    '37': 'Family',
    '38': 'Foreign',
    '39': 'Horror',
    '40': 'Sci-Fi/Fantasy',
    '41': 'Thriller',
    '42': 'Shorts',
    '43': 'Shows',
    '44': 'Trailers'
}

def get_category_name(category_id):
    """Translate a YouTube category id into its display name.

    Args:
        category_id: The category id, either as a string ('27') or as an
            integer (27). It is converted to a string before the lookup
            because the keys of ``category_mapping`` are strings.

    Returns:
        The category name, or 'Unknown' when the id is not in
        ``category_mapping``.
    """
    # Normalise to str so integer ids do not silently fall through to 'Unknown'.
    return category_mapping.get(str(category_id), 'Unknown')


### Scraping Functions

In [122]:
import csv
from googleapiclient.discovery import build
from datetime import datetime

# Set up the YouTube API
# The API key is read from the YOUTUBE_API_KEY environment variable so that
# the credential is never stored in (or committed with) the notebook.
# NOTE(review): any key that was previously hard-coded here should be revoked.
import os

api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API "
        "key before running this notebook."
    )
youtube = build('youtube', 'v3', developerKey=api_key)

def search_videos(query, max_results=100):  # default max result to 100
    """Search YouTube for `query` and return detailed data for the matches.

    Results are requested page by page (at most 50 per request), following
    the API's nextPageToken until `max_results` videos have been gathered or
    no further page is available. Each hit is expanded into a full record
    via get_video_data(). The number of collected videos is printed before
    the list is returned.
    """
    collected = []
    page_token = None

    while True:
        still_needed = max_results - len(collected)
        if still_needed <= 0:
            break

        search_page = youtube.search().list(
            q=query,
            part='snippet',
            type='video',
            maxResults=min(50, still_needed),  # never ask for more than one page holds
            pageToken=page_token
        ).execute()

        # Pull out every id on the page first, then fetch the details for each.
        page_ids = [entry['id']['videoId'] for entry in search_page['items']]
        collected += [get_video_data(page_id) for page_id in page_ids]

        page_token = search_page.get('nextPageToken')
        if not page_token:
            break

    print("Number of Videos Collected: ", len(collected))
    return collected



In [123]:
def get_video_data(video_id):
    """Fetch metadata, statistics and channel details for a single video.

    Args:
        video_id: The YouTube video id to look up.

    Returns:
        A dict with the keys 'title', 'video_id', 'channel_title',
        'channel_id', 'published_date', 'published_time', 'duration'
        (seconds), 'view_count', 'like_count', 'comment_count',
        'category_id' and the four 'channel_*' fields taken from
        get_channel_data(). Note that 'category_id' holds the category
        *name* produced by get_category_name(). Counts are passed through
        as returned by the API and default to 0 when the API omits them.

    Raises:
        IndexError: If the API returns no item for `video_id`.
    """
    response = youtube.videos().list(
        part='snippet,statistics,contentDetails',
        id=video_id
    ).execute()

    items = response['items']
    if not items:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(f"No video data returned for video id {video_id!r}")

    video_data = items[0]
    snippet = video_data['snippet']
    statistics = video_data['statistics']
    content_details = video_data['contentDetails']

    title = snippet['title']
    channel_title = snippet['channelTitle']
    channel_id = snippet['channelId']

    # Accept timestamps both with and without fractional seconds
    # ("...T23:29:22Z" and "...T23:29:22.000Z") by trimming the trailing 'Z'
    # and anything after the first '.' before parsing.
    published_at_str = snippet['publishedAt']
    timestamp = published_at_str.rstrip('Z').split('.')[0]
    published_at = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")
    published_date = published_at.date()
    published_time = published_at.time()

    duration = parse_duration(content_details['duration']).total_seconds()

    # Any of the counts may be missing from the statistics, so all three
    # fall back to 0 instead of raising KeyError.
    view_count = statistics.get('viewCount', 0)
    like_count = statistics.get('likeCount', 0)
    comment_count = statistics.get('commentCount', 0)
    category_id = get_category_name(snippet['categoryId'])

    # fetch channel data
    channel_data = get_channel_data(channel_id)

    return {
        'title': title,
        'video_id': video_id,
        'channel_title': channel_title,
        'channel_id': channel_id,
        'published_date': published_date,
        'published_time': published_time,
        'duration': duration,
        'view_count': view_count,
        'like_count': like_count,
        'comment_count': comment_count,
        'category_id': category_id,
        'channel_subscriber_count': channel_data['channel_subscriber_count'],
        'channel_view_count': channel_data['channel_view_count'],
        'channel_video_count': channel_data['channel_video_count'],
        'channel_country': channel_data['channel_country']
    }

# Channel lookup used when assembling a video record
def get_channel_data(channel_id):
    """Look up a channel by id and return its title, counts and country.

    Called from get_video_data(). The subscriber, view and video counts
    default to 0 and the country defaults to '' when the API response does
    not include them.
    """
    lookup = youtube.channels().list(
        part='snippet,statistics',
        id=channel_id
    )
    channel = lookup.execute()['items'][0]
    info = channel['snippet']
    stats = channel['statistics']

    return {
        'channel_title': info['title'],
        'channel_subscriber_count': stats.get('subscriberCount', 0),
        'channel_view_count': stats.get('viewCount', 0),
        'channel_video_count': stats.get('videoCount', 0),
        'channel_country': info.get('country', '')
    }



### Collecting Data

In [124]:
# Search term to collect videos for; up to 100 results are requested.
query = 'Wellesley College'
videos = search_videos(query=query, max_results=100)

Number of Videos Collected:  100


In [127]:
import pandas as pd

# Turn the list of per-video dicts into a table and preview the first five rows.
df = pd.DataFrame(data=videos)
df.head(n=5)

Unnamed: 0,title,video_id,channel_title,channel_id,published_date,published_time,duration,view_count,like_count,comment_count,category_id,channel_subscriber_count,channel_view_count,channel_video_count,channel_country
0,Wellesley College Campus Tour,p9K0MVE8DeI,WellesleyCollege,UCHIZCvocr0Doe2RLjDmtJwg,2020-07-27,23:29:22,538.0,39210,550,27,Education,85900,12390230,599,US
1,MY HONEST OPINIONS ABOUT WELLESLEY COLLEGE | W...,nvUFzipv1Jg,Shayla Zamora,UCMUcpu7zcHJIbGgjZ5il1Xg,2021-03-23,22:00:19,635.0,17025,324,67,People & Blogs,1340,47795,15,US
2,The Power of Place: An Aerial Tour of Wellesle...,sHcRmaWGVK8,WellesleyCollege,UCHIZCvocr0Doe2RLjDmtJwg,2021-04-13,21:06:21,83.0,19600,233,3,Education,85900,12390230,599,US
3,Campus Feel (it's like a warm hug!),JaxuU6gwv5o,WellesleyCollege,UCHIZCvocr0Doe2RLjDmtJwg,2023-04-21,17:40:20,127.0,3505,83,4,Education,85900,12390230,599,US
4,Wellesley College under federal investigation ...,Vt_K9ogjNks,CBS Boston,UCi4fcBVyo4CAnmdgXeO-NvA,2023-11-17,23:35:51,25.0,1088,13,7,News & Politics,216000,215754871,70846,US


In [129]:
# Remove the video and channel identifier columns, then preview the result.
columns_to_remove = ['video_id', 'channel_id']
df_clean = df.drop(columns_to_remove, axis='columns')
df_clean.head(n=5)

Unnamed: 0,title,channel_title,published_date,published_time,duration,view_count,like_count,comment_count,category_id,channel_subscriber_count,channel_view_count,channel_video_count,channel_country
0,Wellesley College Campus Tour,WellesleyCollege,2020-07-27,23:29:22,538.0,39210,550,27,Education,85900,12390230,599,US
1,MY HONEST OPINIONS ABOUT WELLESLEY COLLEGE | W...,Shayla Zamora,2021-03-23,22:00:19,635.0,17025,324,67,People & Blogs,1340,47795,15,US
2,The Power of Place: An Aerial Tour of Wellesle...,WellesleyCollege,2021-04-13,21:06:21,83.0,19600,233,3,Education,85900,12390230,599,US
3,Campus Feel (it's like a warm hug!),WellesleyCollege,2023-04-21,17:40:20,127.0,3505,83,4,Education,85900,12390230,599,US
4,Wellesley College under federal investigation ...,CBS Boston,2023-11-17,23:35:51,25.0,1088,13,7,News & Politics,216000,215754871,70846,US


In [130]:
# Save the cleaned table to a CSV file in the working directory.
output_csv = 'youtube_data_test.csv'
df_clean.to_csv(output_csv)