In [1]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
# from google_auth_oauthlib.tools import argparser
import pandas as pd
from urllib.parse import urlparse, parse_qs
import isodate

In [14]:
# Read both URL exports; rows with missing values are dropped from the faculty file only.
urls1 = pd.read_csv('video_urls1561574859.551679.csv')
urls2 = pd.read_csv('faculty_vid_urls.csv').dropna()

# Per-file boolean masks:
#   yt_crit*  - the url text contains "youtube"
#   users*    - the url text contains "user", "channel" or "/c/"
# NOTE(review): the "youtube" substring test does not match youtu.be short links,
# although video_id() has a branch for them - confirm that dropping them is intended.
yt_crit1 = urls1['url'].map(lambda link: "youtube" in link)
users1 = urls1['url'].map(lambda link: any(tag in link for tag in ("user", "channel", "/c/")))

yt_crit2 = urls2['url'].map(lambda link: "youtube" in link)
users2 = urls2['url'].map(lambda link: any(tag in link for tag in ("user", "channel", "/c/")))

# Keep rows that pass the "youtube" test and fail the user/channel test.
data1 = urls1.loc[yt_crit1 & ~users1]

data2 = urls2.loc[yt_crit2 & ~users2]

# Stack the two filtered frames; original index labels are kept as-is.
data = pd.concat([data1, data2])

In [19]:
# Keep only the first row seen for each distinct url, then show the frame's dimensions.
data = data[~data.duplicated(subset="url")]
data.shape

(3864, 4)

In [2]:
# extract video id from url
def video_id(value):
    """
    Extract the video id from a YouTube url.

    Examples:
    - http://youtu.be/SA2iWivDJiE
    - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    - http://www.youtube.com/embed/SA2iWivDJiE
    - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US

    Returns the id as a string, or '' when no id can be found (including
    when `value` is not a string, e.g. a missing/NaN cell).
    """
    # A non-string url cannot carry an id; use the same '' sentinel as "no match".
    if not isinstance(value, str):
        return ''

    url_data = urlparse(value)
    query = parse_qs(url_data.query)

    # Short links: the id is the whole path after the leading slash.
    if url_data.hostname == 'youtu.be':
        return url_data.path[1:]

    # Watch links: the id is the first value of the "v" query parameter.
    if "v" in query:
        return query["v"][0]

    # /embed/<id> and /v/<id>: the id is the second path segment.
    if 'embed' in url_data.path or '/v/' in url_data.path:
        segments = url_data.path.split('/')
        # Guard against paths such as "/embed" that have no id segment;
        # indexing unconditionally would raise IndexError.
        return segments[2] if len(segments) > 2 else ''

    return ''

# extract playlist id from url
def playlist_id(value):
    """
    Extract the playlist id from a YouTube url.

    Examples:
    - https://www.youtube.com/playlist?list=PL123
    - https://www.youtube.com/watch?v=abc&list=PL123
    - http://www.youtube.com/view_play_list?p=PL123

    Returns the id as a string, or "" when the url carries no playlist id
    (including when `value` is not a string, e.g. a missing/NaN cell).
    """
    # A non-string url cannot carry an id; use the same "" sentinel as "no match".
    if not isinstance(value, str):
        return ""

    url_data = urlparse(value)
    query = parse_qs(url_data.query)

    if "list" in query:
        return query['list'][0]

    # Legacy links put "view_play_list" in the path and the id in the "p"
    # query parameter, so a query-key lookup alone never matches them.
    if "view_play_list" in url_data.path and "p" in query:
        return query["p"][0]

    # Kept for backward compatibility with the previous lookup.
    if "view_play_list" in query:
        return query["view_play_list"][0]

    return ""

In [25]:
# Derive one video id and one playlist id per row from the url column
# ('' / "" where the url carries none).
data['video_id'] = list(map(video_id, data['url']))
data['playlist_id'] = list(map(playlist_id, data['url']))

In [8]:
# Split rows by whether a playlist id was extracted from the url.
pl_data = data.loc[data["playlist_id"].ne("")]
reg_data = data.loc[data["playlist_id"].eq("")]

In [9]:
# Discard non-playlist rows for which no video id could be extracted.
reg_data = reg_data.loc[reg_data["video_id"].ne("")]

In [10]:
# Distinct values of the `name` column, in order of first appearance.
entities = reg_data["name"].unique()

In [1]:
# One-off request for the `statistics` part of a single hard-coded video id.
# NOTE(review): no cell visible in this notebook assigns a module-level `youtube`
# (the only assignment in view is a local inside get_video_metadata) - confirm a
# client is built elsewhere before this cell runs on a fresh kernel.
stats = youtube.videos().list(part='statistics',id="-3y1E32gUZ8").execute()

In [3]:
def paginate_list(resource, **kwargs):
    """
    Fetch every page of a list request and return all records as one list.

    resource: service resource exposing .list(**kwargs).execute(),
              e.g. something like service.activity()
    kwargs:   forwarded to resource.list(); maxResults is always set to 50.
    """
    kwargs['maxResults'] = 50
    response = resource.list(**kwargs).execute()
    collected = response['items']
    while True:
        # The last page is the one that carries no continuation token.
        if 'nextPageToken' not in response:
            return collected
        kwargs['pageToken'] = response['nextPageToken']
        response = resource.list(**kwargs).execute()
        collected.extend(response['items'])

def get_playlist_items_for_upload_id(yt, uploadId):
    """
    Return all playlistItem records (snippet part) of the playlist whose id is uploadId.

    yt: service object exposing .playlistItems()
    """
    return paginate_list(yt.playlistItems(), playlistId=uploadId, part='snippet')

def get_video_ids_for_playlist_items(playlistItems):
    """
    Collect the video id stored under snippet -> resourceId -> videoId
    of each playlistItem record, preserving input order.
    """
    video_ids = []
    for record in playlistItems:
        video_ids.append(record['snippet']['resourceId']['videoId'])
    return video_ids

In [136]:
# Columns attached to each entity's frame, in the order they are added.
_METADATA_COLUMNS = ('video_title', 'channel_title', 'pubdate',
                     'duration', 'has_captions', 'view_count')


def _video_fields(item):
    """Flatten one videos.list item into a dict keyed by _METADATA_COLUMNS.

    item: one element of the response's 'items' list, or None when the
          response contained no item for the requested id.
    Any field that is absent is reported as "".
    """
    if item is None:
        return dict.fromkeys(_METADATA_COLUMNS, "")

    snippet = item.get('snippet', {})
    details = item.get('contentDetails', {})
    statistics = item.get('statistics', {})

    if 'duration' in details:
        # total_seconds() rather than .seconds: timedelta.seconds excludes whole
        # days, so a duration of 24 hours or more would be reported too short.
        duration = int(isodate.parse_duration(details['duration']).total_seconds())
    else:
        duration = ""

    return {
        'video_title': snippet.get('title', ""),
        'channel_title': snippet.get('channelTitle', ""),
        'pubdate': snippet.get('publishedAt', ""),
        'duration': duration,
        'has_captions': details.get('caption', ""),
        'view_count': statistics.get('viewCount', ""),
    }


def get_video_metadata(df,entities,api_key,start = 0):
    """Look up YouTube metadata for every video of each entity.

    df:       frame with at least `name` and `video_id` columns.
    entities: sequence of values of df.name to process.
    api_key:  developer key passed to the API client.
    start:    index into `entities` at which to begin.

    Returns a dict mapping entity name -> copy of that entity's rows with the
    columns video_title, channel_title, pubdate, duration (whole seconds),
    has_captions and view_count added. Fields the API did not supply, and all
    fields of a video for which the API returned no item, are "".

    If a request raises HttpError, the error is printed and the dict built so
    far is returned; the entity being processed at that point is not included
    (presumably so the run can be resumed via `start` - confirm with callers).
    """
    YOUTUBE_API_SERVICE_NAME = "youtube"
    YOUTUBE_API_VERSION = "v3"

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,developerKey=api_key)

    video_dict = {}

    for name in entities[start:]:
        print(name)
        test = df[df.name==name].copy()

        # Exactly one record per row of `test`, so the new columns always match
        # the frame's length even when a video id yields no API item.
        records = []
        for vid in test.video_id:
            try:
                stats = youtube.videos().list(part='statistics,contentDetails,snippet',id=vid).execute()
            except HttpError as e:
                print(e)
                return video_dict
            items = stats.get('items', [])
            records.append(_video_fields(items[0] if items else None))

        for column in _METADATA_COLUMNS:
            test[column] = [record[column] for record in records]

        video_dict[name]=test
        print("done with",name)

    return video_dict

In [64]:
# vids_from_pls