In [2]:
import requests
import random
import pandas as pd
import re
import json
import datetime as dt
from bs4 import BeautifulSoup
from headers import headers_list
from loading_bar import log_progress
from skill_extraction import extract_skills, extract_ignore
from secrets import api_keys

In [7]:
# Endpoint of the YouTube Data API v3 search resource.
URL = 'https://www.googleapis.com/youtube/v3/search'
# Candidate values for the `order` request parameter.
ORDERS = [
    'date',
    'rating',
    'relevance',
    'title',
    'videoCount',
    'viewCount',
]

# API key looked up from the local `secrets` module.
API_KEY = api_keys['youtube']
# Lower bound for `publishedAfter`: 365.25 days before the current UTC time.
# The API expects an RFC 3339 date-time value (e.g. 1970-01-01T00:00:00Z).
START_DATE = (
    dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=365.25)
).isoformat()

In [8]:
def get_youtube_videos(query, max_results=50, order='viewCount'):
    """Search YouTube for videos matching `query` and return them as a DataFrame.

    Sends a GET request to ``URL`` restricted to videos published after
    ``START_DATE``, using a randomly chosen entry of ``headers_list`` as the
    request headers.

    Parameters
    ----------
    query : str
        Search term, sent as the ``q`` parameter.
    max_results : int, default 50
        Sent as the ``maxResults`` parameter.
    order : str, default 'viewCount'
        Sent as the ``order`` parameter.

    Returns
    -------
    pandas.DataFrame or None
        One row per returned item with the columns ``id``, ``title``,
        ``channel``, ``url``, ``published_date`` and ``description``.
        Fields missing from the response are ``None``. Returns ``None`` when
        the response status code is not 200.
    """
    columns = ['id', 'title', 'channel', 'url', 'published_date', 'description']
    params = {'q': query, 'part': 'snippet', 'type': 'video', 'maxResults': max_results, 'order': order,
              'publishedAfter': START_DATE, 'key': API_KEY}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(URL, params=params, headers=random.choice(headers_list), timeout=30)
    if page.status_code != 200:
        return None
    res = json.loads(page.content.decode('utf8'))
    records = []
    video_url = 'https://youtube.com/watch?v='
    # A response without 'items' is treated as an empty result set.
    for item in res.get('items', []):
        video_id = get_object(get_object(item, 'id'), 'videoId')
        snippet = get_object(item, 'snippet')
        records.append({
            'id': video_id,
            'title': get_object(snippet, 'title'),
            'channel': get_object(snippet, 'channelTitle'),
            # Only build the URL when an id is present; str + None would raise.
            'url': video_url + video_id if video_id is not None else None,
            'published_date': get_object(snippet, 'publishTime'),
            'description': get_object(snippet, 'description')
        })
    # Explicit columns keep the schema stable even when no items are returned.
    return pd.DataFrame(records, columns=columns)

def get_object(item, key):
    """Return ``item[key]``, or ``None`` when the lookup is not possible.

    Parameters
    ----------
    item : object
        Container to index (a dict or list); may be ``None``.
    key : object
        Key or index to look up.

    Returns
    -------
    object or None
        The stored value, or ``None`` if the key/index is missing or `item`
        does not support this lookup.
    """
    try:
        return item[key]
    # Only swallow lookup failures; a bare except would also hide
    # KeyboardInterrupt and SystemExit.
    except (KeyError, IndexError, TypeError):
        return None

In [9]:
# Search results for the "data science" query; preview the first rows.
df_ds = get_youtube_videos(query='data science')
df_ds.head(n=5)

Unnamed: 0,id,title,channel,url,published_date,description
0,fmLPS6FBbac,Become a DATA ANALYST with NO degree?!? The Go...,Luke Barousse,https://youtube.com/watch?v=fmLPS6FBbac,2021-03-17T19:50:14Z,Google Certificate is here!
1,MHPGeQD8TvI,How I would learn to code (if I could start over),Tina Huang,https://youtube.com/watch?v=MHPGeQD8TvI,2021-04-25T03:00:08Z,"In this video, I give you my step by step proc..."
2,UPxfw6Mpq5A,ये Free IBM Courses जीवन बदल देंगी | 6-Month F...,fzFact,https://youtube.com/watch?v=UPxfw6Mpq5A,2021-04-11T12:39:48Z,Learn all Data Analyst skills for free and get...
3,pKvWD0f18Pc,Day in the Life of a Data Analyst - SurveyMonk...,Shashank Kalanithi,https://youtube.com/watch?v=pKvWD0f18Pc,2021-02-08T04:27:24Z,NEWEST Day in the Life of a Data Analyst: http...
4,-JWgi9jupmU,Learn Data Science Online with DataCamp!,DataCamp,https://youtube.com/watch?v=-JWgi9jupmU,2020-10-21T13:49:49Z,Start learning today: https://www.datacamp.com.


In [10]:
# Search results for the "artificial intelligence" query; preview the first rows.
df_ai = get_youtube_videos(query='artificial intelligence')
df_ai.head(n=5)

Unnamed: 0,id,title,channel,url,published_date,description
0,9jkRcrM6XKA,&quot;I Tried To Warn You&quot; - Elon Musk LA...,Elon Musk Zone,https://youtube.com/watch?v=9jkRcrM6XKA,2021-07-20T10:55:39Z,Become a MUSK FAN & get access to perks: https...
1,jpewc52m4tg,Viz.ai: Improving Outcomes for Stroke Victims ...,Amazon Web Services,https://youtube.com/watch?v=jpewc52m4tg,2021-03-03T16:10:51Z,"Viz.ai, an Israeli-based technology provider, ..."
2,VdG4gUTowXc,A.I. Designed this Car,Donut Media,https://youtube.com/watch?v=VdG4gUTowXc,2020-10-13T16:00:05Z,Thanks to Raycon for sponsoring today's episod...
3,bXCYBsG6ork,Historical Figures Recreated From Paintings Us...,Mystery Scoop,https://youtube.com/watch?v=bXCYBsG6ork,2020-12-16T18:00:06Z,A digital artist is teaching machines how to i...
4,SiZZkrEyw5Q,What They Don&#39;t Want You To See. Boston Dy...,Digital Engine,https://youtube.com/watch?v=SiZZkrEyw5Q,2021-04-03T10:35:23Z,Why do Boston Dynamics Robots dance? And Could...


In [11]:
# Search results for the "machine learning" query; preview the first rows.
df_ml = get_youtube_videos(query='machine learning')
df_ml.head(n=5)

Unnamed: 0,id,title,channel,url,published_date,description
0,P1IcaBn3ej0,Enhancing Photorealism Enhancement,Vladlen Koltun and Collaborators,https://youtube.com/watch?v=P1IcaBn3ej0,2021-05-10T20:13:00Z,Enhancing Photorealism Enhancement Stephan R. ...
1,revk5r5vqxA,FPS Games Are About to Die,Basically Homeless,https://youtube.com/watch?v=revk5r5vqxA,2021-07-13T18:00:25Z,New Machine Learning AI Hacks have been develo...
2,ZZs4R-5u4g4,Machine Learning: A Whale Tale - The Big Idea,Amazon Web Services,https://youtube.com/watch?v=ZZs4R-5u4g4,2021-02-15T18:43:53Z,"Meet Lisa Steiner, a marine biologist who has ..."
3,xfzGZB4HhEE,Algorithmic Trading Using Python - Full Course,freeCodeCamp.org,https://youtube.com/watch?v=xfzGZB4HhEE,2020-12-04T17:26:37Z,Learn how to perform algorithmic trading using...
4,W-9jIVm0-p8,Top 10 Certifications For 2021 | Highest Payin...,Simplilearn,https://youtube.com/watch?v=W-9jIVm0-p8,2020-12-03T13:30:03Z,Explore our FREE Courses: ...


In [12]:
# Stack the three query results, order them by video id, drop rows that are
# exact duplicates (the same video returned by more than one query) and
# renumber the index from 0.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
df = pd.concat([df_ds, df_ai, df_ml])
df = df.sort_values(by='id')
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,title,channel,url,published_date,description
0,-0WQnwNFqJM,AI learns to Speedrun QWOP using Machine Learning,Wesley Liao,https://youtube.com/watch?v=-0WQnwNFqJM,2021-02-26T01:49:53Z,UPDATE: AI was able to surpass the World Recor...
1,-JWgi9jupmU,Learn Data Science Online with DataCamp!,DataCamp,https://youtube.com/watch?v=-JWgi9jupmU,2020-10-21T13:49:49Z,Start learning today: https://www.datacamp.com.
2,-q3d3lD0fFM,Elon Musk Says AI Will Take Over in 5 Years-Ho...,Tech Flake,https://youtube.com/watch?v=-q3d3lD0fFM,2021-03-05T01:19:30Z,Elon Musk Says AI Will Take Over in 5 Years - ...
3,0MNVhXEX9to,Reinforcement Learning: Machine Learning Meets...,Steve Brunton,https://youtube.com/watch?v=0MNVhXEX9to,2021-02-12T11:00:00Z,Reinforcement learning is a powerful technique...
4,0PrOA2JK6GQ,Machine Learning Explicado,Ciência Todo Dia,https://youtube.com/watch?v=0PrOA2JK6GQ,2021-02-25T17:01:36Z,Machine learning (ou aprendizado de máquina) é...


In [13]:
def get_full_description(video_id):
    """Fetch the description of a single video from the videos endpoint.

    Parameters
    ----------
    video_id : str
        Video id, sent as the ``id`` parameter.

    Returns
    -------
    str or None
        The ``description`` field of the first returned item's snippet, or
        ``None`` when the status code is not 200 or the response cannot be
        decoded or does not have the expected structure.
    """
    url = 'https://www.googleapis.com/youtube/v3/videos'
    # Use the module-level API_KEY constant; the lower-case `api_key` is never
    # defined in this notebook and raises NameError on a fresh kernel.
    params = {'part': 'snippet', 'id': video_id, 'key': API_KEY}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(url, params=params, headers=random.choice(headers_list), timeout=30)
    if page.status_code != 200:
        return None
    try:
        res = json.loads(page.content.decode('utf8'))
        return res['items'][0]['snippet']['description']
    # ValueError covers both JSON and UTF-8 decoding errors; the others cover
    # a response with no items or an unexpected shape.
    except (ValueError, KeyError, IndexError, TypeError):
        return None

In [17]:
# Extract skills from each video's title and description and store them as a
# '; '-joined, sorted string in the 'skills' column.
df['description_full'] = None
df['skills'] = None

for i in log_progress(range(0, len(df))):
    row = df.loc[i]
    # Missing (non-string) titles or descriptions are treated as empty text.
    title = row['title'] if isinstance(row['title'], str) else ''
    description = row['description'] if isinstance(row['description'], str) else ''
    # A trailing '...' is treated as a truncated snippet, so the complete
    # description is requested separately.
    if description.endswith('...'):
        full_description = get_full_description(row['id'])
        # get_full_description returns None on failure; in that case keep the
        # truncated snippet instead of failing on str + None.
        if full_description is not None:
            description = full_description
            df.loc[i, 'description_full'] = full_description
    full_text = title + ' ' + description
    all_skills = extract_skills(full_text)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    df.loc[i, 'skills'] = '; '.join(keep_skills)

df.head()

VBox(children=(HTML(value=''), IntProgress(value=0, max=142)))

Unnamed: 0,id,title,channel,url,published_date,description,description_full,skills
0,-0WQnwNFqJM,AI learns to Speedrun QWOP using Machine Learning,Wesley Liao,https://youtube.com/watch?v=-0WQnwNFqJM,2021-02-26T01:49:53Z,UPDATE: AI was able to surpass the World Recor...,UPDATE:\nAI was able to surpass the World Reco...,Artificial Intelligence (AI); Github; Imitatio...
1,-JWgi9jupmU,Learn Data Science Online with DataCamp!,DataCamp,https://youtube.com/watch?v=-JWgi9jupmU,2020-10-21T13:49:49Z,Start learning today: https://www.datacamp.com.,,Data Science
2,-q3d3lD0fFM,Elon Musk Says AI Will Take Over in 5 Years-Ho...,Tech Flake,https://youtube.com/watch?v=-q3d3lD0fFM,2021-03-05T01:19:30Z,Elon Musk Says AI Will Take Over in 5 Years - ...,Elon Musk Says AI Will Take Over in 5 Years - ...,Artificial Intelligence (AI); Youtube
3,0MNVhXEX9to,Reinforcement Learning: Machine Learning Meets...,Steve Brunton,https://youtube.com/watch?v=0MNVhXEX9to,2021-02-12T11:00:00Z,Reinforcement learning is a powerful technique...,Reinforcement learning is a powerful technique...,Algorithm; Decision Process; Machine Learning;...
4,0PrOA2JK6GQ,Machine Learning Explicado,Ciência Todo Dia,https://youtube.com/watch?v=0PrOA2JK6GQ,2021-02-25T17:01:36Z,Machine learning (ou aprendizado de máquina) é...,Machine learning (ou aprendizado de máquina) é...,Instagram; Machine Learning; Twitter; Youtube


In [18]:
# Show any rows whose 'skills' value is still missing.
df[df['skills'].isnull()]

Unnamed: 0,id,title,channel,url,published_date,description,description_full,skills


In [19]:
# Write the complete table, then a copy without the 'description_full' column.
df.to_csv('results/youtube_full.csv', index=False)
df_without_full = df.drop(columns=['description_full'])
df_without_full.to_csv('results/youtube.csv', index=False)