<h1><center>Playing with Youtube Data</center></h1>

## Importing Packages

In [2]:
import json
import os

import isodate
import numpy as np
import pandas as pd
import requests

## Defining an api key
(An API key is required to use the YouTube Data API; one can be generated for free at https://developers.google.com/youtube/v3/getting-started)

In [3]:
# The YouTube Data API key is read from the environment so that the secret is
# never stored in (or committed with) the notebook. Export YOUTUBE_API_KEY
# before starting the kernel. Any key that was previously hard-coded here
# should be treated as leaked and revoked in the Google Cloud console.
api_key = os.environ.get('YOUTUBE_API_KEY', '')
if not api_key:
    print('YOUTUBE_API_KEY is not set; YouTube API requests will fail until it is exported.')

## Defining wrapper functions to extract YouTube video data

In [4]:
def retrieve_uploads_id(username):
    """
    Takes in a youtube channel name (eg. nigahiga) and returns the id of the playlist holding all of its uploads

    args: username - channel name, sent to the API as the `forUsername` filter
    output: playlist id of the channel's "uploads" playlist
    raises: requests.HTTPError if the API answers with an error status,
            ValueError if the response contains no channel for this username
    """
    url = 'https://www.googleapis.com/youtube/v3/channels'
    parameters = {'part': 'contentDetails',
                  'key': api_key,
                  'forUsername': username}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(url, params = parameters, timeout = 30)
    # Surface quota/authentication failures here rather than as a KeyError below.
    r.raise_for_status()
    items = r.json().get('items', [])
    if not items:
        raise ValueError('No YouTube channel found for username {!r}'.format(username))
    return items[0]['contentDetails']['relatedPlaylists']['uploads']

def retrieve_playlist_video_ids(playlist_id, num = 100000):
    """
    Takes in a playlist id and returns a list of the video ids in the playlist

    args: playlist_id, num (maximum number of video ids you want to retrieve)
    output: list of at most `num` video ids, in the order the API returns them
    raises: requests.HTTPError if the API answers with an error status
    """
    url = 'https://www.googleapis.com/youtube/v3/playlistItems'
    parameters = {'part': 'snippet, contentDetails',
                  'fields': 'nextPageToken, items(contentDetails(videoId))',
                  'playlistId': playlist_id,
                  'key': api_key,
                  'maxResults': 50}
    limit = max(int(num), 0)
    video_ids = []
    # Keep requesting pages of up to 50 ids until enough have been collected or
    # the API stops handing out a nextPageToken. Counting collected ids (rather
    # than pages) makes limits that are not multiples of 50 come out exact.
    while len(video_ids) < limit:
        r = requests.get(url = url, params = parameters, timeout = 30)
        r.raise_for_status()
        results = r.json()
        video_ids.extend(item['contentDetails']['videoId'] for item in results.get('items', []))
        next_token = results.get('nextPageToken')
        if not next_token:
            break
        parameters['pageToken'] = next_token
    # The last page may overshoot the requested amount.
    return video_ids[:limit]

def _video_feature_row(item):
    """Flatten one item of a `videos` API response into the 11-value row used by retrieve_playlist."""
    snippet = item['snippet']
    statistics = item['statistics']
    topic_urls = item.get('topicDetails', {}).get('topicCategories')
    return [
        snippet['title'],
        snippet['description'],
        snippet['publishedAt'],
        snippet['channelTitle'],
        # Only the number of tags is kept, not the tags themselves.
        len(snippet['tags']) if 'tags' in snippet else 'None',
        item['contentDetails']['duration'],
        # Statistics the response omits are recorded as the string 'None'.
        statistics.get('viewCount', 'None'),
        statistics.get('likeCount', 'None'),
        statistics.get('dislikeCount', 'None'),
        statistics.get('commentCount', 'None'),
        # Topic categories are URLs; keep only the last path segment of each.
        ';'.join(topic_url.split('/')[-1] for topic_url in topic_urls) if topic_urls is not None else 'None',
    ]


def retrieve_playlist(video_ids): 
    """
    Takes in a list of video_ids and returns a dataframe of 11 features for the first 50 videos

    Only the first 50 ids are sent (the original note attributes this cap to a
    restriction on the API's id parameter). Features missing from the response
    are stored as the string 'None'.

    args: video_ids - list of video id strings
    output: dataframe with columns title, description, date, channel, num_tags, duration,
            viewcount, likecount, dislikecount, commentcount, topics (zero rows if nothing is returned)
    raises: requests.HTTPError if the API answers with an error status
    """
    columns = ['title', 'description', 'date', 'channel', 'num_tags', 'duration', 'viewcount', 'likecount', 'dislikecount', 'commentcount', 'topics']
    if not video_ids:
        # Nothing to look up: skip the request and return an empty, correctly shaped frame.
        return pd.DataFrame([], columns = columns)
    url = 'https://www.googleapis.com/youtube/v3/videos'
    parameters = {'part': 'snippet, statistics, contentDetails, topicDetails',
                  'id': ','.join(video_ids[:50]),
                  'key': api_key}
    r = requests.get(url, params = parameters, timeout = 30)
    r.raise_for_status()
    rows = [_video_feature_row(item) for item in r.json().get('items', [])]
    # Passing the column names to the constructor keeps the 11 columns even when no rows came back.
    return pd.DataFrame(rows, columns = columns)

def retrieve_all_videos(video_ids, num):
    """
    Takes in a list of video_ids and an integer num (number of sets of 50 videos) and churns out a dataframe of 11 features

    args: video_ids - list of video ids, num - how many chunks of 50 ids to fetch
    output: the per-chunk dataframes from retrieve_playlist stacked in order;
            an empty dataframe when there is nothing to fetch
    """
    chunks = [video_ids[x:x+50] for x in range(0, len(video_ids), 50)]
    # Slicing clamps num to the chunks that actually exist, so an over-estimated
    # num no longer raises IndexError; max() keeps a negative num meaning "none".
    frames = [retrieve_playlist(chunk) for chunk in chunks[:max(num, 0)]]
    if not frames:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    # One concat at the end replaces repeated DataFrame.append, which was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    return pd.concat(frames)

def retrieve_all_from_channel(username):
    """
    Combines all above functions, takes in a username and churns out a dataframe of 11 features

    args: username - channel name passed to retrieve_uploads_id
    output: dataframe returned by retrieve_all_videos for every upload of the channel
    """
    playlist_id = retrieve_uploads_id(username)
    video_ids = retrieve_playlist_video_ids(playlist_id)
    # Ceiling division: number of 50-id chunks needed to cover every video.
    # int(n/50) + 1 asked for one chunk too many whenever n was a multiple of 50 (including 0).
    num_chunks = (len(video_ids) + 49) // 50
    return retrieve_all_videos(video_ids, num_chunks)

    
def _popular_video_row(item):
    """Flatten one item of a `videos` API response into the 11-value row used by retrieve_most_popular_eng_videos."""
    snippet = item['snippet']
    statistics = item['statistics']
    topic_urls = item.get('topicDetails', {}).get('topicCategories')
    return [
        snippet['title'],
        snippet['description'],
        snippet['publishedAt'],
        snippet['channelTitle'],
        # Only the number of tags is kept, not the tags themselves.
        len(snippet['tags']) if 'tags' in snippet else 'None',
        item['contentDetails']['duration'],
        statistics.get('viewCount', 'None'),
        statistics.get('likeCount', 'None'),
        # A missing dislike count is recorded as 0 here (not 'None'); kept to
        # preserve this function's existing output.
        statistics.get('dislikeCount', 0),
        statistics.get('commentCount', 'None'),
        # Topic categories are URLs; keep only the last path segment of each.
        ';'.join(topic_url.split('/')[-1] for topic_url in topic_urls) if topic_urls is not None else 'None',
    ]


def retrieve_most_popular_eng_videos(region_code):
    """
    Retrieve the most popular english videos by region code and churns out a dataframe consisting of 11 features

    Pages through the `mostPopular` chart for the region and keeps only items
    whose snippet declares defaultAudioLanguage == 'en'.

    args: region_code - value sent as the API's `regionCode` parameter
    output: dataframe with columns title, description, date, channel, num_tags, duration,
            viewcount, likecount, dislikecount, commentcount, topics (zero rows if nothing matches)
    raises: requests.HTTPError if the API answers with an error status
    """
    url = 'https://www.googleapis.com/youtube/v3/videos'
    parameters = {'part': 'snippet, statistics, contentDetails, topicDetails',
                  'chart': 'mostPopular',
                  'regionCode': region_code,
                  'maxResults': 50,
                  'key': api_key}
    columns = ['title', 'description', 'date', 'channel', 'num_tags', 'duration', 'viewcount', 'likecount', 'dislikecount', 'commentcount', 'topics']
    rows = []
    while True:
        r = requests.get(url, params = parameters, timeout = 30)
        r.raise_for_status()
        results = r.json()
        for item in results.get('items', []):
            try:
                if item['snippet']['defaultAudioLanguage'] != 'en':
                    continue
                rows.append(_popular_video_row(item))
            except KeyError:
                # Best effort: items without a declared audio language (or
                # lacking another required field) are skipped, as before.
                continue
        next_token = results.get('nextPageToken')
        if not next_token:
            break
        parameters['pageToken'] = next_token
    # Passing the column names to the constructor keeps the 11 columns even when no video matched.
    return pd.DataFrame(rows, columns = columns)

## Extract a few government ministries' youtube data to play with!

In [5]:
# Download every upload of the four ministry channels (MINDEF, MOE, MOH, MHA),
# stack the results into one frame and persist it for the cleaning step.
df_mindef, df_moe, df_moh, df_mha = [
    retrieve_all_from_channel(channel_username)
    for channel_username in ('cyberpioneertv', 'MOESpore', 'MOHSingapore', 'hometeamnews')
]
df_final = pd.concat((df_mindef, df_moe, df_moh, df_mha))
df_final.to_csv('df_final.csv')

In [9]:
# Notice that the date, duration and topics columns still need to be cleaned up: they are stored as raw strings.
# We can also create new features such as title length, description length, ratio of likes to dislikes, etc.
df_final.head()

Unnamed: 0,title,description,date,channel,num_tags,duration,viewcount,likecount,dislikecount,commentcount,topics
0,A Journey Like No Other: Episode 6 - Eyes on o...,"""Everytime we sail, we are protecting Singapor...",2018-12-29T10:01:18.000Z,Ministry of Defence Singapore,12,PT3M39S,2558,43,3,0,Military;Society
1,The SAF Band Takes Japan! (54th JSDF Marching ...,The SAF Band takes to the international stage ...,2019-01-04T11:32:48.000Z,Ministry of Defence Singapore,10,PT1M49S,1268,24,0,3,Entertainment
2,"Total Defence Awards 2018: With You, We Can","National Service (NS) is not just about us, bu...",2018-12-22T10:00:03.000Z,Ministry of Defence Singapore,16,PT4M44S,2836,41,5,2,Military;Society
3,111/18 Officer Cadet Course Commissioning Para...,What is it like to be a Midshipman Commanding ...,2018-12-17T10:38:31.000Z,Ministry of Defence Singapore,18,PT2M9S,4383,38,10,3,Military;Society
4,"Behind the Scenes of Royston Tan's ""36 Ways to...","Be it drinking your morning Kopi, hearing bird...",2018-11-28T03:58:23.000Z,Ministry of Defence Singapore,18,PT1M23S,1008,8,1,2,Film;Entertainment


## Importing Bokeh, matplotlib, ipywidgets for data viz

In [7]:
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, export_png, output_file,save
from bokeh.models import ColumnDataSource, CategoricalColorMapper, Legend, HoverTool
from bokeh.layouts import row, column, gridplot
from ipywidgets import interact
import ipywidgets as widgets

## Sorting out problematic columns

In [10]:
# Reload the raw export, using the publish date as the index.
df = pd.read_csv('df_final.csv', parse_dates = True, index_col = ['date'])
# 'Unnamed: 0' is the positional index that to_csv wrote next to the data.
df = df.drop(columns = ['Unnamed: 0'])
# Durations are ISO 8601 strings such as 'PT3M39S'.
df['duration'] = df['duration'].apply(isodate.parse_duration)
# Missing features were stored as the string 'None'. np.nan is used because the
# capitalised np.NaN alias was removed in NumPy 2.0.
df = df.replace('None', np.nan)
# Named explicitly: the former positional slice columns[4:-1] skipped num_tags,
# so that column was never coerced to a numeric dtype.
numeric_columns = ['num_tags', 'viewcount', 'likecount', 'dislikecount', 'commentcount']
for variable in numeric_columns:
    df[variable] = pd.to_numeric(df[variable], errors = 'coerce')
# Lengths are word counts (split on single spaces); missing text stays missing.
df['title_length'] = df['title'].apply(lambda x: len(x.split(' ')) if (not pd.isna(x)) else x)
df['description_length'] = df['description'].apply(lambda x: len(x.split(' ')) if (not pd.isna(x)) else x)
# total_likes counts likes plus dislikes; adjusted_likes weights the like count by the share of likes.
df['total_likes'] = df['likecount'] + df['dislikecount']
df['adjusted_likes'] = df['likecount'] / df['total_likes'] * df['likecount']
df['seconds'] = df['duration'].apply(lambda x: x.total_seconds())
# One frame per ministry, selected by the channel title in the data.
df_mindef = df[df['channel'] == 'Ministry of Defence Singapore']
df_moe = df[df['channel'] == 'MOE Singapore']
df_moh = df[df['channel'] == 'MOHSingapore']
df_mha = df[df['channel'] == 'MHA Singapore']
df.to_csv('youtubedata_cleaned.csv')

## Creating an interactive timeplot of various features in the youtube data

In [21]:
def plot(variable, year1 = None, year2 = None):
    """
    Show a 2x2 grid of scatter plots of `variable` over time, one panel per ministry.

    args: variable - name of the dataframe column plotted on the y axis
          year1, year2 - first and last year to include (inclusive label slice on
          the date index); None leaves that side of the range open
    output: None; the grid is rendered with bokeh's show()

    Reads the module-level frames df_mindef, df_moe, df_moh and df_mha.
    """
    hover = HoverTool(tooltips = [('title','@title'), ('%likes','@adjusted_likes'), ('views', '@viewcount')])
    # str(None) would turn into the label 'None'; keeping None leaves the slice open-ended.
    start = None if year1 is None else str(year1)
    end = None if year2 is None else str(year2)

    ministries = [(df_mindef, 'green', 'mindef'),
                  (df_moe, 'red', 'moe'),
                  (df_moh, 'orange', 'moh'),
                  (df_mha, 'blue', 'mha')]
    figures = []
    for frame, colour, label in ministries:
        # The rows are not in chronological order; sort so that slicing the
        # date index by year labels is well defined.
        source = ColumnDataSource(frame.sort_index().loc[start:end])
        # NOTE(review): plot_width/plot_height and legend= are the keyword names
        # this notebook was written against - confirm against the installed bokeh version.
        p = figure(x_axis_label = 'time', y_axis_label = variable, x_axis_type = 'datetime', plot_width = 400, plot_height = 400)
        p.circle('date', variable, source = source, color = colour, legend = label)
        p.legend.location = 'top_left'
        p.add_tools(hover)
        figures.append(p)

    p1, p2, p3, p4 = figures
    # Link the axes: every panel shares p1's x range and p2's y range.
    for p in (p2, p3, p4):
        p.x_range = p1.x_range
    for p in (p1, p3, p4):
        p.y_range = p2.y_range
    layout = column(gridplot(children = [[p1,p2],[p3,p4]]))
    show(layout)

# Columns offered in the dropdown: everything after the first three columns,
# with the text-valued 'topics' column taken out first.
plottable_columns = df.columns.tolist()
plottable_columns.remove('topics')
variable_ls = plottable_columns[3:]

In [23]:
# Render bokeh output inline, then wire the plot function to two year sliders
# and a dropdown of plottable columns.
output_notebook()
year_slider_settings = dict(min=2009, max=2018, step=1, value=2018)
interact(
    plot,
    year1=widgets.IntSlider(**year_slider_settings),
    year2=widgets.IntSlider(**year_slider_settings),
    variable=variable_ls,
)

interactive(children=(Dropdown(description='variable', options=('num_tags', 'duration', 'viewcount', 'likecoun…

<function __main__.plot(variable, year1=None, year2=None)>

# Feel free to play with the years and variables to compare the 4 ministries!!