In [132]:
import requests
import random
import pandas as pd
import re
import json
import datetime as dt
import time
from bs4 import BeautifulSoup
from headers import headers_list
from loading_bar import log_progress
from data_skills import DATA_SKILLS, SKILL_DICT
from skill_extraction import extract_skills, extract_ignore, extract_data_skills

In [95]:
# Site root; relative result links are appended to this.
base_url = 'https://www.youtube.com'
# Search results endpoint
url = f'{base_url}/results'
# Values for the `sp` search parameter, keyed by the time window used to filter the query
sp_dict = dict(
    this_year='EgQIBRAB',
    this_month='EgQIBBAB',
    this_week='EgQIAxAB',
    today='EgQIAhAB',
)

In [196]:
# NOTE(review): `skill` was not defined anywhere above this cell in the saved
# notebook (it only exists later as a loop variable), so a fresh-kernel run
# failed here. The sample output further down looks like a financial-statement
# search, so that is used as the example value -- confirm.
skill = 'Financial Statement'
query = 'learn ' + skill
# requests URL-encodes `params` itself (spaces become '+'). Replacing the spaces
# by hand first would send literal plus signs ('%2B') as part of the query text.
params = {'search_query': query, 'sp': sp_dict['this_year']}
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, params=params, headers=random.choice(headers_list), timeout=30)
page

<Response [200]>

In [202]:
# Parse the response body so its <script> tags can be searched.
soup = BeautifulSoup(page.content, 'html.parser')
# Keep whatever follows the last `var ytInitialData = ` assignment ...
json_text = str(soup.find_all('script')).split('var ytInitialData = ')[-1]
# ... and cut it off where that script tag ends.
json_text = json_text.split(';</script>')[0]
res = json.loads(json_text)
# Walk down to the list of result sections.
res = res['contents']['twoColumnSearchResultsRenderer']
res = res['primaryContents']['sectionListRenderer']['contents']

In [122]:
def get_text(content, info):
    """Join the text fragments stored under ``content[info]['runs']``.

    Args:
        content: Mapping describing one search result.
        info: Key of the field to read (the notebook uses ``'title'`` and
            ``'ownerText'``).

    Returns:
        The ``'text'`` value of every run joined with single spaces, or
        ``None`` when the field is missing or not shaped as expected.
    """
    try:
        return ' '.join(run['text'] for run in content[info]['runs'])
    except (KeyError, IndexError, TypeError):
        # Only lookup/shape problems mean "no text". Unlike a bare ``except``
        # this does not swallow KeyboardInterrupt or unrelated bugs.
        return None

def get_length(content):
    """Return the video length as a colon-separated string with at least three fields.

    Reads ``content['lengthText']['simpleText']`` and left-pads it with
    ``'00'`` fields until it has three of them, so ``'45'`` becomes
    ``'00:00:45'`` and ``'6:29'`` becomes ``'00:6:29'``. Existing fields are
    not zero-padded, and values that already have three or more fields are
    returned unchanged.

    Args:
        content: Mapping describing one search result.

    Returns:
        The padded length string, or ``None`` when the length is missing or
        is not a string.
    """
    try:
        parts = content['lengthText']['simpleText'].split(':')
    except (KeyError, IndexError, TypeError, AttributeError):
        # Missing field, or a value that is not a string: no usable length.
        return None
    # A negative repeat count yields an empty list, so longer values pass through.
    missing = 3 - len(parts)
    return ':'.join(['00'] * missing + parts)

def get_published_date(content, now=None):
    """Estimate the publication year and month from a relative age string.

    Reads ``content['publishedTimeText']['simpleText']``, takes the first
    whitespace-separated token made only of digits as the amount, and
    subtracts that many units from the reference time. Units are detected by
    substring, checked in the order year, month, week, day, hour, minute,
    second. Years and months are approximated as 365.25 and 30.436875 days.

    Args:
        content: Mapping describing one search result.
        now: Optional ``datetime`` used as the reference point. Defaults to
            ``dt.datetime.now()``; passing it makes the result reproducible.

    Returns:
        ``(year, month)`` of the estimated publication date, or
        ``(None, None)`` when the field is missing, holds no number, names no
        recognised unit, or the date cannot be computed.
    """
    # (substring to look for, timedelta keyword, multiplier applied to the amount)
    units = (
        ('year', 'days', 365.25),
        ('month', 'days', 30.436875),
        ('week', 'weeks', 1),
        ('day', 'days', 1),
        ('hour', 'hours', 1),
        ('minute', 'minutes', 1),
        ('second', 'seconds', 1),
    )
    try:
        published_time = content['publishedTimeText']['simpleText']
        val = [int(s) for s in published_time.split() if s.isdigit()][0]
        current = dt.datetime.now() if now is None else now
        for keyword, kwarg, factor in units:
            if keyword in published_time:
                published = current - dt.timedelta(**{kwarg: factor * val})
                return published.year, published.month
    except (KeyError, IndexError, TypeError, AttributeError, ValueError, OverflowError):
        # Missing field, no numeric token, non-string value, or a date that
        # falls outside the range datetime can represent.
        return None, None
    # A number was found but no known unit: previously this surfaced as an
    # UnboundLocalError hidden by a bare ``except``.
    return None, None

def get_view_count(content):
    """Parse the integer view count from ``content['viewCountText']['simpleText']``.

    Accepts text that starts with a (comma-grouped) number followed by
    ``view`` or ``views``, e.g. ``'79,615 views'`` or ``'1 view'``. The
    singular form used to be rejected because only ``' views'`` was matched.

    Args:
        content: Mapping describing one search result.

    Returns:
        The view count as ``int``, or ``None`` when the field is missing or
        does not describe a view count (e.g. ``'12 watching'``).
    """
    try:
        text = content['viewCountText']['simpleText']
    except (KeyError, IndexError, TypeError):
        return None
    if not isinstance(text, str):
        return None
    # NOTE(review): assumes a video without views is rendered as "No views" -- confirm.
    if text.strip().lower() == 'no views':
        return 0
    match = re.match(r'\s*([\d,]+)\s+views?\b', text)
    if match is None:
        return None
    try:
        return int(match.group(1).replace(',', ''))
    except ValueError:
        # e.g. the numeric part consisted of commas only
        return None

def get_url(content):
    """Build the absolute URL of a search result.

    Appends the relative URL found at
    ``content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']``
    to the module-level ``base_url``.

    Args:
        content: Mapping describing one search result.

    Returns:
        The absolute URL, or ``None`` when the relative URL is missing or is
        not a string.

    Raises:
        NameError: If ``base_url`` has not been defined. This used to be
            hidden by a bare ``except`` and turned every URL into ``None``.
    """
    try:
        # Named `path` so it does not shadow the module-level `url`.
        path = content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
        return base_url + path
    except (KeyError, IndexError, TypeError):
        return None

def get_description(content):
    """Return the description snippet of a search result.

    Joins the ``'text'`` of every run in
    ``content['detailedMetadataSnippets'][0]['snippetText']['runs']`` with
    single spaces; only the first snippet is used.

    Args:
        content: Mapping describing one search result.

    Returns:
        The joined snippet text, or ``None`` when no snippet is present or it
        is not shaped as expected.
    """
    try:
        runs = content['detailedMetadataSnippets'][0]['snippetText']['runs']
        return ' '.join(run['text'] for run in runs)
    except (KeyError, IndexError, TypeError):
        # Narrower than a bare ``except``: only missing/odd-shaped data maps to None.
        return None

def get_skills(title, description):
    """Extract skill labels from a video's title and description.

    The title (plus the description, when there is one) is passed to
    ``extract_skills``; the result is filtered through ``extract_ignore``,
    sorted, and then handed to ``extract_data_skills``.

    Args:
        title: Video title.
        description: Video description, or ``None``.

    Returns:
        ``(skills, data_skills)`` where each element is a ``'; '``-joined
        string or ``None`` when the corresponding list is empty.
    """
    context = title if description is None else title + ' ' + description
    all_skills = extract_skills(context)
    # The "Video" skill is dropped: it is not relevant for YouTube results
    if 'Video' in all_skills:
        all_skills.remove('Video')
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    if len(keep_skills) == 0:
        return None, None
    data_skills = extract_data_skills(keep_skills)
    joined_skills = '; '.join(keep_skills)
    if len(data_skills) == 0:
        return joined_skills, None
    return joined_skills, '; '.join(data_skills)

In [203]:
video_list = []
# Iterate through each section of the parsed search results
for contents in res:
    # Get only those with video
    if 'itemSectionRenderer' not in contents:
        continue
    contents = contents['itemSectionRenderer']['contents']
    for content in contents:
        # Ignore ads
        if 'videoRenderer' not in content:
            continue
        content = content['videoRenderer']
        title = get_text(content, 'title')
        # Entries without a readable title are skipped
        if title is None:
            continue
        description = get_description(content)
        skills, data_skills = get_skills(title, description)
        published_year, published_month = get_published_date(content)
        video_list.append({
            'id': content['videoId'],
            'title': title,
            'channel': get_text(content, 'ownerText'),
            'published_year': published_year,
            'published_month': published_month,
            'length': get_length(content),
            'view_count': get_view_count(content),
            'url': get_url(content),
            'description': description,
            'skills': skills,
            'data_skills': data_skills
        })
# Naming the columns keeps the frame well-formed when no video was parsed;
# without them the `length` lookup below raised KeyError on an empty frame.
df = pd.DataFrame(video_list, columns=[
    'id', 'title', 'channel', 'published_year', 'published_month', 'length',
    'view_count', 'url', 'description', 'skills', 'data_skills',
])
df['length'] = pd.to_timedelta(df['length'])
df.head()

Unnamed: 0,id,title,channel,published_year,published_month,length,view_count,url,description,skills,data_skills
0,_F6a0ddbjtI,The KEY to Understanding Financial Statements,Accounting Stuff,2020,10,0 days 00:06:29,79615.0,https://www.youtube.com/watch?v=_F6a0ddbjtI,In this video I'll share the key to understand...,Balance Sheet; Financial Statement,Financial Statements
1,DLRF-eeFj1g,How To Read And Understand Financial Statement...,Bench Accounting,2021,5,0 days 00:11:08,11183.0,https://www.youtube.com/watch?v=DLRF-eeFj1g,There are three main types of financial state...,Balance Sheet; Financial Statement; Income Sta...,Financial Statements
2,DMv9JC_K37Y,How To Read & Analyze The Balance Sheet Like a...,The Financial Controller,2021,1,0 days 00:21:32,224726.0,https://www.youtube.com/watch?v=DMv9JC_K37Y,I walk you through both the accounting and the...,Accounting; Analysis; Balance Sheet,
3,CMv1zlZhb4Q,The BALANCE SHEET for BEGINNERS (Full Example),Accounting Stuff,2021,4,0 days 00:06:59,63212.0,https://www.youtube.com/watch?v=CMv1zlZhb4Q,"The ' Balance Sheet ', or 'Statement of Financ...",Balance Sheet; Financial Statement,Financial Statements
4,Q1m76iMIepU,IAS 1 Presentation of Financial Statements: Su...,Silvia of CPDbox,2020,10,0 days 00:12:08,39627.0,https://www.youtube.com/watch?v=Q1m76iMIepU,https://www.cpdbox.com/ This is just the short...,Financial Statement; Presentation,Financial Statements; Presentation Skill


In [204]:
def get_youtube_videos(skill, filter_time=None):
    """Search YouTube for ``'learn <skill>'`` and return the results as a DataFrame.

    Args:
        skill: Skill name appended to ``'learn '`` to form the search query.
        filter_time: Optional time filter, one of ``'this_year'``,
            ``'this_month'``, ``'this_week'`` or ``'today'``. ``None`` (the
            default) searches without a filter.

    Returns:
        A DataFrame with one row per video and the columns ``id``, ``title``,
        ``channel``, ``published_year``, ``published_month``, ``length``
        (timedelta), ``view_count``, ``url``, ``description``, ``skills`` and
        ``data_skills``. The frame is empty, with the same columns, when no
        video was found. Returns ``None`` when ``filter_time`` is not a known
        key or the response status is not 200.
    """
    base_url = 'https://www.youtube.com'
    # Values for the `sp` search parameter, keyed by filter name
    sp_dict = {'this_year': 'EgQIBRAB', 'this_month': 'EgQIBBAB', 'this_week': 'EgQIAxAB', 'today': 'EgQIAhAB'}
    # Only an explicit, unknown filter is rejected. The previous check also
    # rejected the default None, which made the unfiltered search unreachable.
    if filter_time is not None and filter_time not in sp_dict:
        return None
    url = base_url + '/results'
    # requests URL-encodes `params` itself (spaces become '+'). Replacing the
    # spaces by hand first would send literal plus signs ('%2B') in the query.
    params = {'search_query': 'learn ' + skill}
    # Default is no filter
    if filter_time is not None:
        params['sp'] = sp_dict[filter_time]
    # The timeout keeps a stalled connection from blocking the caller forever.
    page = requests.get(url, params=params, headers=random.choice(headers_list), timeout=30)
    if page.status_code != 200:
        print(page, page.reason)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # The results are read from the JSON that follows `var ytInitialData = `
    # inside the page's <script> tags.
    json_text = str(soup.find_all('script')).split('var ytInitialData = ')[-1].split(';</script>')[0]
    res = json.loads(json_text)
    res = res['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']
    video_list = []
    for section in res:
        # Get only those with video
        if 'itemSectionRenderer' not in section:
            continue
        for item in section['itemSectionRenderer']['contents']:
            # Ignore ads
            if 'videoRenderer' not in item:
                continue
            content = item['videoRenderer']
            title = get_text(content, 'title')
            # Entries without a readable title are skipped
            if title is None:
                continue
            description = get_description(content)
            skills, data_skills = get_skills(title, description)
            published_year, published_month = get_published_date(content)
            video_list.append({
                'id': content['videoId'],
                'title': title,
                'channel': get_text(content, 'ownerText'),
                'published_year': published_year,
                'published_month': published_month,
                'length': get_length(content),
                'view_count': get_view_count(content),
                'url': get_url(content),
                'description': description,
                'skills': skills,
                'data_skills': data_skills
            })
    # Naming the columns keeps the frame well-formed when no video was parsed;
    # without them the `length` lookup below raised KeyError on an empty frame.
    columns = [
        'id', 'title', 'channel', 'published_year', 'published_month', 'length',
        'view_count', 'url', 'description', 'skills', 'data_skills',
    ]
    df = pd.DataFrame(video_list, columns=columns)
    df['length'] = pd.to_timedelta(df['length'])
    return df

In [206]:
ignore = ['apache', 'c++', 'financial statement']
# Per-skill frames are collected and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the growing
# frame on every iteration.
frames = []

for skill in log_progress(DATA_SKILLS):
    if skill.lower() in ignore:
        continue
    df_temp = get_youtube_videos(skill, 'this_year')
    # get_youtube_videos returns None when the request did not succeed
    if df_temp is not None:
        frames.append(df_temp)
    # Pause between requests
    time.sleep(5)

# pd.concat raises on an empty list, so fall back to an empty frame
df_yt = pd.concat(frames) if frames else pd.DataFrame()
df_yt.head()

VBox(children=(HTML(value=''), IntProgress(value=0, max=37)))

Unnamed: 0,id,title,channel,published_year,published_month,length,view_count,url,description,skills,data_skills
0,lBBtq3Oawqw,MongoDB Full Course | MongoDB Tutorial | Mongo...,Simplilearn,2021,7,0 days 02:16:59,15375.0,https://www.youtube.com/watch?v=lBBtq3Oawqw,This MongoDB full course will help you under...,Database; MongoDB; NoSQL,Mongo DB; NoSQL
1,Www6cTUymCY,MongoDB Tutorial For Beginners | Full Course,Amigoscode,2021,6,0 days 01:08:31,39019.0,https://www.youtube.com/watch?v=Www6cTUymCY,MongoDB is great database for any project. In...,Database; MongoDB,Mongo DB
2,SnqPyqRh4r4,What Is MongoDB? | What Is MongoDB And How It ...,Simplilearn,2021,4,0 days 00:06:32,23857.0,https://www.youtube.com/watch?v=SnqPyqRh4r4,So here's the list of topics covered: 00:00:00...,MongoDB,Mongo DB
3,ofme2o29ngU,MongoDB Crash Course,Web Dev Simplified,2021,9,0 days 00:29:59,22333.0,https://www.youtube.com/watch?v=ofme2o29ngU,MongoDB Cheat Sheet: https://webdevsimplified...,Database; MongoDB; NoSQL,Mongo DB; NoSQL
4,oSIv-E60NiU,MongoDb Tutorial For Beginners in Hindi 🔥🔥,CodeWithHarry,2021,7,0 days 01:13:36,161229.0,https://www.youtube.com/watch?v=oSIv-E60NiU,"MongoDB crash course: In this video, we will ...",Database; Hindi; MongoDB; NoSQL,Mongo DB; NoSQL


In [220]:
# Row counts: all rows, rows left after dropping fully identical rows,
# and rows left after dropping repeated video ids
(
    len(df_yt),
    len(df_yt.drop_duplicates()),
    len(df_yt.drop_duplicates(subset='id')),
)

(740, 727, 698)

In [231]:
# Keep one row per video id, then order by publication year/month and id
df_yt = (
    df_yt
    .drop_duplicates(subset='id')
    .sort_values(by=['published_year', 'published_month', 'id'])
)
# NOTE(review): the cast `df_yt['view_count'] = df_yt['view_count'].astype(int)`
# was left disabled here -- presumably because rows with a missing view count
# cannot be cast to int; confirm before re-enabling.
df_yt.head()

Unnamed: 0,id,title,channel,published_year,published_month,length,view_count,url,description,skills,data_skills
14,-T_l8TjAzXk,WHY LEARN TABLEAU? | Is Tableau worth learning...,Abhishek Agarrwal,2020,10,0 days 00:12:04,6634.0,https://www.youtube.com/watch?v=-T_l8TjAzXk,Many people who are a beginner and aspiring to...,Business Intelligence (BI); Business Intellige...,Tableau
11,0nH1ospwJd4,4 Quick Tips to Make an Engaging PowerPoint Pr...,ClassPoint,2020,10,0 days 00:09:36,5578.0,https://www.youtube.com/watch?v=0nH1ospwJd4,Here're 4 quick tips on how to make engaging P...,Animation; Microsoft PowerPoint; Presentation,Presentation Skill
19,1LHch7usbzY,Processing Covid-19 Data with Apache Spark,Manning Publications,2020,10,0 days 01:14:32,1056.0,https://www.youtube.com/watch?v=1LHch7usbzY,"""Spark in Action, Second Edition: Covers Apac...",Apache Spark; Java; Processing; Python; Scala,SPARK; Java; Python Programming
8,2ViNp_UijMQ,Apache Spark with Scala By Example - learn Apa...,Petrina Corine,2020,10,0 days 00:03:46,5.0,https://www.youtube.com/watch?v=2ViNp_UijMQ,Learn the fundamentals and run examplesof Spa...,Apache Spark; Dataset; Distributed Database; S...,SPARK
13,2sTeNU3nE3o,ETL explained in 5 Minutes,DaveSplains,2020,10,0 days 00:04:50,132.0,https://www.youtube.com/watch?v=2sTeNU3nE3o,"The concept of Extract Transform Load , or E...",Extract Transform Load (ETL); Programming; Tra...,ETL


In [232]:
# Write the de-duplicated results to disk without the index column
df_yt.to_csv(path_or_buf='database/youtube.csv', index=False)