In [1]:
import requests
import random
import pandas as pd
import re
import json
import datetime as dt
from bs4 import BeautifulSoup
from headers import headers_list
from loading_bar import log_progress
from data_skills import SKILLS, SKILL_DICT
from skill_extraction import extract_skills, extract_ignore
from secrets import api_keys

In [2]:
# Endpoint queried by get_youtube_videos.
URL = 'https://www.googleapis.com/youtube/v3/search'
# Presumably the accepted values of the `order` search parameter; this list is
# not referenced by the cells visible in this notebook.
ORDERS = [
    'date',
    'rating',
    'relevance',
    'title',
    'videoCount',
    'viewCount',
]

API_KEY = api_keys['youtube']
# Must be in RFC 3339 formatted date-time value (1970-01-01T00:00:00Z)
# Timestamp 365.25 days before the current UTC time, as an ISO 8601 string.
START_DATE = (dt.datetime.now(tz=dt.timezone.utc) - dt.timedelta(days=365.25)).isoformat()

In [26]:
def get_youtube_videos(query, max_results=50, order='viewCount', start_date=None):
    """Run one YouTube search and return the hits as a DataFrame.

    Parameters
    ----------
    query : str
        Search terms, sent as the ``q`` request parameter.
    max_results : int, default 50
        Sent as the ``maxResults`` request parameter.
    order : str, default 'viewCount'
        Sent as the ``order`` request parameter.
    start_date : str, optional
        When given, sent as the ``publishedAfter`` request parameter.

    Returns
    -------
    pandas.DataFrame or None
        One row per returned item with the columns ``id``, ``title``,
        ``channel``, ``url``, ``published_date``, ``description`` and
        ``relevance`` (1-based position of the item in the response).
        ``None`` when the response status is not 200.

    Raises
    ------
    requests.RequestException
        Propagated unchanged when the request itself fails or times out.
    """
    params = {'q': query, 'part': 'snippet', 'type': 'video', 'maxResults': max_results, 'order': order,
              'key': API_KEY}
    if start_date is not None:
        params['publishedAfter'] = start_date
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(URL, params=params, headers=random.choice(headers_list), timeout=30)
    if page.status_code != 200:
        return None
    res = json.loads(page.content.decode('utf8'))
    columns = ['id', 'title', 'channel', 'url', 'published_date', 'description', 'relevance']
    videos = []
    video_url = 'https://youtube.com/watch?v='
    # A payload without 'items' is treated as an empty result set.
    for rank, item in enumerate(get_object(res, 'items') or [], start=1):
        video_id = get_object(get_object(item, 'id'), 'videoId')
        snippet = get_object(item, 'snippet') or {}
        videos.append({
            'id': video_id,
            'title': get_object(snippet, 'title'),
            'channel': get_object(snippet, 'channelTitle'),
            # Without an id there is no watch URL to build.
            'url': video_url + video_id if video_id is not None else None,
            'published_date': get_object(snippet, 'publishTime'),
            'description': get_object(snippet, 'description'),
            'relevance': rank
        })
    # Explicit columns so an empty result still has the expected schema.
    return pd.DataFrame(videos, columns=columns)

def get_object(item, key):
    """Return ``item[key]``, or ``None`` when the lookup is not possible.

    Parameters
    ----------
    item : object
        Any subscriptable container (dict, list, ...); may also be ``None``.
    key : object
        Key or index to look up.

    Returns
    -------
    object or None
        The stored value, or ``None`` if the key/index is missing or ``item``
        does not support subscripting.
    """
    try:
        return item[key]
    # Only lookup failures are swallowed; KeyboardInterrupt, SystemExit and
    # unrelated errors must still propagate.
    except (LookupError, TypeError):
        return None

In [28]:
# Run one "learn <skill>" search per skill and collect the per-skill frames.
# The frames are concatenated once at the end: DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
frames = []

for i in log_progress(range(len(SKILLS))):
    skill = SKILLS[i]
    query = 'learn ' + skill
    df_temp = get_youtube_videos(query, order='relevance')
    if df_temp is None:
        # get_youtube_videos returns None on a non-200 response; skip the
        # skill rather than crash on the column assignment below.
        print('Skipping ' + skill + ': search request failed')
        continue
    df_temp['data_skills'] = skill
    frames.append(df_temp)

# Row labels are kept as-is (each per-skill frame restarts at 0), matching
# what the old append-based loop produced.
df = pd.concat(frames) if frames else pd.DataFrame()

df.head()

VBox(children=(HTML(value=''), IntProgress(value=0, max=37)))

Unnamed: 0,id,title,channel,url,published_date,description,relevance,data_skills
0,rCr3-YlL5S8,Basics of Apache Webserver,Amit Nepal,https://youtube.com/watch?v=rCr3-YlL5S8,2019-07-07T18:45:01Z,A video tutorial on the basics of Apache Web S...,1,APACHE
1,0E3RAUIx1Ms,How to play Apache by The Shadows - Guitar Les...,Heanor School Of Music,https://youtube.com/watch?v=0E3RAUIx1Ms,2015-07-19T12:28:54Z,How to play Apache by The Shadows In this guit...,2,APACHE
2,zC9cnh8rJd0,Spark Tutorial | Spark Tutorial for Beginners ...,Great Learning,https://youtube.com/watch?v=zC9cnh8rJd0,2020-04-24T11:55:52Z,Great Learning offers a range of extensive Dat...,3,APACHE
3,L-GRrGAghKk,Bylas Apache Lesson 1 Greetings,Safford JW,https://youtube.com/watch?v=L-GRrGAghKk,2015-11-04T17:53:51Z,This is basic greetings in the Bylas Apache La...,4,APACHE
4,RG77qqGwVls,Bike chalana Sikha,Samiul Aashiqui wala,https://youtube.com/watch?v=RG77qqGwVls,2018-01-19T08:05:30Z,Samiul Islam.,5,APACHE


In [37]:
# (total rows, rows left after keeping one row per video id)
df.shape[0], df.drop_duplicates(subset='id').shape[0]

(1850, 1793)

In [42]:
# Attach per-video aggregates. A video returned by several skill searches has
# one row per search; after these merges every such row also carries the mean
# rank (relevance_y) and the joined skill list (data_skills_y).
# NOTE: this cell is not idempotent - the merges rename `relevance` and
# `data_skills` to *_x / *_y, so re-running it without rebuilding `df` fails.

# Select `relevance` explicitly: with pandas >= 2.0, mean() over the string
# columns raises TypeError instead of silently dropping them.
mean_relevance = df.groupby('id')[['relevance']].mean()
df = df.merge(mean_relevance, left_on='id', right_index=True)

joined_skills = df.groupby('id')[['data_skills']].agg('; '.join)
df = df.merge(joined_skills, left_on='id', right_index=True)
df.head()

Unnamed: 0,id,title,channel,url,published_date,description,relevance_x,data_skills_x,relevance_y,data_skills_y
0,rCr3-YlL5S8,Basics of Apache Webserver,Amit Nepal,https://youtube.com/watch?v=rCr3-YlL5S8,2019-07-07T18:45:01Z,A video tutorial on the basics of Apache Web S...,1,APACHE,1.0,APACHE
1,0E3RAUIx1Ms,How to play Apache by The Shadows - Guitar Les...,Heanor School Of Music,https://youtube.com/watch?v=0E3RAUIx1Ms,2015-07-19T12:28:54Z,How to play Apache by The Shadows In this guit...,2,APACHE,2.0,APACHE
2,zC9cnh8rJd0,Spark Tutorial | Spark Tutorial for Beginners ...,Great Learning,https://youtube.com/watch?v=zC9cnh8rJd0,2020-04-24T11:55:52Z,Great Learning offers a range of extensive Dat...,3,APACHE,2.0,APACHE; SPARK
0,zC9cnh8rJd0,Spark Tutorial | Spark Tutorial for Beginners ...,Great Learning,https://youtube.com/watch?v=zC9cnh8rJd0,2020-04-24T11:55:52Z,Great Learning offers a range of extensive Dat...,1,SPARK,2.0,APACHE; SPARK
3,L-GRrGAghKk,Bylas Apache Lesson 1 Greetings,Safford JW,https://youtube.com/watch?v=L-GRrGAghKk,2015-11-04T17:53:51Z,This is basic greetings in the Bylas Apache La...,4,APACHE,4.0,APACHE


In [43]:
# Collapse to one row per video: keep the first row of each id, discard the
# per-search columns, and promote the aggregated columns to the plain names.
df = (
    df.drop_duplicates(subset=['id'])
      .drop(columns=['relevance_x', 'data_skills_x'])
      .rename(columns={'relevance_y': 'relevance', 'data_skills_y': 'data_skills'})
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,title,channel,url,published_date,description,relevance,data_skills
0,rCr3-YlL5S8,Basics of Apache Webserver,Amit Nepal,https://youtube.com/watch?v=rCr3-YlL5S8,2019-07-07T18:45:01Z,A video tutorial on the basics of Apache Web S...,1.0,APACHE
1,0E3RAUIx1Ms,How to play Apache by The Shadows - Guitar Les...,Heanor School Of Music,https://youtube.com/watch?v=0E3RAUIx1Ms,2015-07-19T12:28:54Z,How to play Apache by The Shadows In this guit...,2.0,APACHE
2,zC9cnh8rJd0,Spark Tutorial | Spark Tutorial for Beginners ...,Great Learning,https://youtube.com/watch?v=zC9cnh8rJd0,2020-04-24T11:55:52Z,Great Learning offers a range of extensive Dat...,2.0,APACHE; SPARK
3,L-GRrGAghKk,Bylas Apache Lesson 1 Greetings,Safford JW,https://youtube.com/watch?v=L-GRrGAghKk,2015-11-04T17:53:51Z,This is basic greetings in the Bylas Apache La...,4.0,APACHE
4,RG77qqGwVls,Bike chalana Sikha,Samiul Aashiqui wala,https://youtube.com/watch?v=RG77qqGwVls,2018-01-19T08:05:30Z,Samiul Islam.,5.0,APACHE


In [21]:
def get_full_description(video_id):
    """Fetch the untruncated description of one video.

    Parameters
    ----------
    video_id : str
        Sent as the ``id`` request parameter of the videos endpoint.

    Returns
    -------
    str or None
        The ``description`` of the first returned item's snippet, or ``None``
        when the response status is not 200, the body cannot be decoded, or
        the expected fields are missing.

    Raises
    ------
    requests.RequestException
        Propagated unchanged when the request itself fails or times out.
    """
    url = 'https://www.googleapis.com/youtube/v3/videos'
    params = {'part': 'snippet', 'id': video_id, 'key': API_KEY}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(url, params=params, headers=random.choice(headers_list), timeout=30)
    if page.status_code != 200:
        return None
    try:
        res = json.loads(page.content.decode('utf8'))
        return res['items'][0]['snippet']['description']
    # ValueError covers bad UTF-8 and bad JSON; LookupError/TypeError cover a
    # payload without the expected structure. Anything else propagates.
    except (ValueError, LookupError, TypeError):
        return None

In [22]:
# For every video, build the text to scan (title + description), fetching the
# full description when the search snippet was truncated, then store the
# extracted skills as a sorted '; '-separated string.
# Relies on `df` having a 0..n-1 index so that .loc[i] addresses row i.
df['description_full'] = None
df['skills'] = None

for i in log_progress(range(0, len(df))):
    row = df.loc[i]
    # Either field can be None when the API response lacked it.
    title = row['title'] if isinstance(row['title'], str) else ''
    description = row['description'] if isinstance(row['description'], str) else ''
    if description.endswith('...'):
        full_description = get_full_description(row['id'])
        # get_full_description returns None on failure; in that case keep
        # the truncated snippet instead of crashing on `' ' + None`.
        if full_description is not None:
            df.loc[i, 'description_full'] = full_description
            description = full_description
    full_text = title + ' ' + description
    all_skills = extract_skills(full_text)
    keep_skills, _ = extract_ignore(all_skills)
    df.loc[i, 'skills'] = '; '.join(sorted(keep_skills))

df.head()

VBox(children=(HTML(value=''), IntProgress(value=0, max=1789)))

Unnamed: 0,id,title,channel,url,published_date,description,data_skills,description_full,skills
0,rCr3-YlL5S8,Basics of Apache Webserver,Amit Nepal,https://youtube.com/watch?v=rCr3-YlL5S8,2019-07-07T18:45:01Z,A video tutorial on the basics of Apache Web S...,APACHE,A video tutorial on the basics of Apache Web S...,Apache; Configuration; Operating System; Servi...
1,0E3RAUIx1Ms,How to play Apache by The Shadows - Guitar Les...,Heanor School Of Music,https://youtube.com/watch?v=0E3RAUIx1Ms,2015-07-19T12:28:54Z,How to play Apache by The Shadows In this guit...,APACHE,How to play Apache by The Shadows\n\nIn this g...,Apache; Facebook; Music; Play; Twitter; Youtube
2,zC9cnh8rJd0,Spark Tutorial | Spark Tutorial for Beginners ...,Great Learning,https://youtube.com/watch?v=zC9cnh8rJd0,2020-04-24T11:55:52Z,Great Learning offers a range of extensive Dat...,APACHE,Great Learning offers a range of extensive Dat...,Apache Hadoop; Apache Hive; Apache Spark; Apac...
3,L-GRrGAghKk,Bylas Apache Lesson 1 Greetings,Safford JW,https://youtube.com/watch?v=L-GRrGAghKk,2015-11-04T17:53:51Z,This is basic greetings in the Bylas Apache La...,APACHE,,Apache
4,RG77qqGwVls,Bike chalana Sikha,Samiul Aashiqui wala,https://youtube.com/watch?v=RG77qqGwVls,2018-01-19T08:05:30Z,Samiul Islam.,APACHE,,


In [52]:
# Treat an empty skills string as missing, then list the videos without skills.
no_skills = df['skills'].eq('')
df.loc[no_skills, 'skills'] = None
df[df['skills'].isna()]

Unnamed: 0,id,title,channel,url,published_date,description,relevance,data_skills,description_full,skills
4,RG77qqGwVls,Bike chalana Sikha,Samiul Aashiqui wala,https://youtube.com/watch?v=RG77qqGwVls,2018-01-19T08:05:30Z,Samiul Islam.,5.0,APACHE,,
1598,uMz6LtCOU6A,How to AB Test with Google Optimize,Digital Basics,https://youtube.com/watch?v=uMz6LtCOU6A,2019-03-05T13:08:27Z,Learn how to modify and change parts of your w...,14.0,AB Testing,,
1659,2UphAzryVpY,A different way to visualize rhythm - John Varney,TED-Ed,https://youtube.com/watch?v=2UphAzryVpY,2014-10-20T15:27:13Z,View full lesson: http://ed.ted.com/lessons/a-...,32.0,Visualizations,,


In [54]:
# Sort rows by video id before exporting.
df = df.sort_values('id')
# Compact export: everything except the full description text.
df.drop('description_full', axis=1).to_csv('results/youtube.csv', index=False)
# Full export: all columns.
df.to_csv('results/youtube_full.csv', index=False)