In [12]:
import random
import requests
import pandas as pd
import json
import datetime as dt
import time
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from headers import headers_list
from data_skills import DATA_SKILLS
from skill_extraction import extract_skills, extract_ignore, extract_data_skills
from secrets import settings, api_keys

## JMLR

In [None]:
def jmlr_scraper(engine):
    """Scrape the JMLR volume 22 index and return the papers not yet stored.

    Args:
        engine: Database engine/connection passed to ``pd.read_sql_query`` to
            read the ``id`` and ``title`` columns of the "ContentJMLR" table.

    Returns:
        A DataFrame with one row per new paper (``title``, ``authors``,
        ``journal_num``, one column per link text found on the entry, and,
        when available, ``abstract``, ``skills`` and ``data_skills``) plus an
        ``id`` column continuing after the largest stored id. Returns None
        when the index page does not answer with HTTP 200 or when every
        listed paper is already stored.
    """
    base_url = 'https://jmlr.org'
    url = base_url + '/papers/v22/'
    page = requests.get(url, headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # Get existing papers in database (a set makes the membership test O(1))
    df_ex = pd.read_sql_query('select cj.id, cj.title from "ContentJMLR" cj', engine)
    ex_papers = set(df_ex['title'])
    papers = []
    # Iterate through each paper
    for dl in soup.findAll('dl'):
        title_tag = dl.find('dt')
        dd = dl.find('dd')
        # Skip malformed entries instead of failing on a missing tag
        if title_tag is None or dd is None:
            continue
        title = title_tag.get_text()
        if title in ex_papers:
            continue
        details = dd.get_text()
        paper = {
            'title': title,
            # Text before the first ';' holds the authors
            'authors': details.split(';')[0].strip(),
            # First line of the text after the last ';' holds the journal reference
            'journal_num': details.split(';')[-1].split('\n')[0].strip(),
        }
        for a in dd.findAll('a'):
            if a.get_text() == '(Machine Learning Open Source Software Paper)':
                continue
            href = a.get('href')
            if href is None:
                continue
            # Relative links are resolved against the site root
            if 'http' not in href:
                href = base_url + href
            paper[a.get_text()] = href
        # Get abstract of paper and extract skills (only possible with an 'abs' link)
        output = get_abstract_skills(paper) if 'abs' in paper else None
        if output is not None:
            paper['abstract'] = output[0]
            if len(output[1]) > 0:
                paper['skills'] = '; '.join(output[1])
                data_skills = extract_data_skills(output[1])
                if len(data_skills) > 0:
                    paper['data_skills'] = '; '.join(data_skills)
        papers.append(paper)
    # Compile into dataframe if we have new papers
    if not papers:
        return None
    df = pd.DataFrame(papers)
    # max() of an empty column would raise, so an empty table starts numbering at 0
    # NOTE(review): confirm 0 is the desired first id for an empty table
    next_id = 0 if df_ex.empty else df_ex['id'].max() + 1
    df['id'] = df.index + next_id
    return df

def get_abstract_skills(paper):
    """Download a paper's abstract page and extract skills from title + abstract.

    Args:
        paper: Dict with a ``'title'`` entry and, normally, an ``'abs'`` entry
            holding the URL of the abstract page.

    Returns:
        ``(abstract, keep_skills)`` where ``keep_skills`` is the sorted list
        kept by ``extract_ignore``; None when the paper has no ``'abs'`` URL,
        the page does not answer with HTTP 200, or the page contains no
        ``<p class="abstract">`` element.
    """
    abs_url = paper.get('abs')
    # Not every entry is guaranteed to expose an abstract link
    if abs_url is None:
        return None
    page = requests.get(abs_url, headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    abstract_tag = soup.find('p', class_='abstract')
    # find() returns None when the page layout differs; avoid AttributeError
    if abstract_tag is None:
        return None
    abstract = abstract_tag.get_text().strip('\n')
    all_skills = extract_skills(paper['title'] + ' ' + abstract)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    return abstract, keep_skills

In [None]:
# Connect to the skills database and scrape the JMLR papers that are not stored yet
engine = create_engine(settings['skills_db'])
df_jmlr = jmlr_scraper(engine)
# jmlr_scraper returns None when the page is unreachable or nothing is new,
# so only preview the frame when there is one
df_jmlr.head() if df_jmlr is not None else None

In [None]:
# Release the connections held by the engine created for the JMLR scrape
engine.dispose()

In [None]:
# jmlr_scraper returns None when there is nothing new; only export real results
if df_jmlr is not None:
    df_jmlr.to_csv('database/jmlr.csv', index=False)

## YouTube

In [None]:
def get_youtube_videos(skill, filter_time=None):
    """Scrape the YouTube search results for ``'learn <skill>'``.

    Args:
        skill: Skill name appended to the word "learn" to form the query.
        filter_time: Optional upload-date filter: ``'this_year'``,
            ``'this_month'``, ``'this_week'`` or ``'today'``. None (the
            default) searches without a filter.

    Returns:
        A DataFrame with one row per video result (``id``, ``title``,
        ``channel``, ``published_year``, ``published_month``, ``length``,
        ``view_count``, ``url``, ``description``, ``skills``,
        ``data_skills``). Returns None when ``filter_time`` is not a known
        filter or the request does not answer with HTTP 200.

    Raises:
        Whatever ``json.loads`` or the dictionary lookups raise when the page
        does not contain the expected ``ytInitialData`` payload.
    """
    base_url = 'https://www.youtube.com'
    # Dictionary for filtering search query
    sp_dict = {'this_year': 'EgQIBRAB', 'this_month': 'EgQIBBAB', 'this_week': 'EgQIAxAB', 'today': 'EgQIAhAB'}
    # None means "no filter"; only unknown non-None values are rejected
    if filter_time is not None and filter_time not in sp_dict:
        return None
    url = base_url + '/results'
    # requests URL-encodes parameter values itself, so pass the raw query;
    # pre-replacing spaces with '+' would send a literal plus sign (%2B)
    params = {'search_query': 'learn ' + skill}
    # Default is no filter
    if filter_time is not None:
        params['sp'] = sp_dict[filter_time]
    page = requests.get(url, params=params, headers=random.choice(headers_list))
    if page.status_code != 200:
        print(page, page.reason)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # The results are embedded as a JSON literal assigned to ytInitialData in a <script> tag
    json_text = str(soup.find_all('script')).split('var ytInitialData = ')[-1].split(';</script>')[0]
    res = json.loads(json_text)
    res = res['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']
    video_list = []
    # Iterate through each video
    for section in res:
        # Get only those with video
        if 'itemSectionRenderer' not in section:
            continue
        for item in section['itemSectionRenderer']['contents']:
            # Ignore ads
            if 'videoRenderer' not in item:
                continue
            content = item['videoRenderer']
            title = get_text(content, 'title')
            if title is None:
                continue
            description = get_description(content)
            skills, data_skills = get_skills(title, description)
            published_year, published_month = get_published_date(content)
            video_list.append({
                'id': content['videoId'],
                'title': title,
                'channel': get_text(content, 'ownerText'),
                'published_year': published_year,
                'published_month': published_month,
                'length': get_length(content),
                'view_count': get_view_count(content),
                'url': get_url(content),
                'description': description,
                'skills': skills,
                'data_skills': data_skills
            })
    return pd.DataFrame(video_list)

def get_text(content, info):
    """Join the ``text`` of every run stored under ``content[info]['runs']``.

    Args:
        content: Mapping describing one video.
        info: Key of the text field to read (e.g. ``'title'``).

    Returns:
        The run texts joined with single spaces, or None when the expected
        structure is missing.
    """
    try:
        return ' '.join(run['text'] for run in content[info]['runs'])
    # Only structural mismatches are treated as "no value"; a bare except
    # would also swallow KeyboardInterrupt and genuine programming errors
    except (KeyError, TypeError):
        return None

def get_length(content):
    """Return the video length as three colon-separated fields.

    Reads ``content['lengthText']['simpleText']`` and left-pads the
    colon-separated fields with ``'00'`` until there are three of them
    (``'3:45'`` becomes ``'00:3:45'``). The individual fields are not
    zero-padded.

    Returns:
        The padded string, or None when the length text is missing.
    """
    try:
        parts = content['lengthText']['simpleText'].split(':')
    # Missing keys or a non-string value simply mean "length unknown"
    except (KeyError, TypeError, AttributeError):
        return None
    # A negative repeat count yields an empty list, so longer values pass through
    padding = ['00'] * (3 - len(parts))
    return ':'.join(padding + parts)

def get_published_date(content):
    """Estimate the publication year and month from a relative age text.

    Reads ``content['publishedTimeText']['simpleText']``, takes the first
    all-digit token as the amount and subtracts that many years, months,
    weeks, days, hours, minutes or seconds from the current local time.

    Returns:
        ``(year, month)`` of the estimated publication moment, or
        ``(None, None)`` when the text is missing, contains no number or
        names no known unit.
    """
    # (keyword searched in the text, timedelta argument, amount per unit);
    # checked in this order and the first match wins
    units = (
        ('year', 'days', 365.25),
        ('month', 'days', 30.436875),
        ('week', 'weeks', 1),
        ('day', 'days', 1),
        ('hour', 'hours', 1),
        ('minute', 'minutes', 1),
        ('second', 'seconds', 1),
    )
    try:
        published_time = content['publishedTimeText']['simpleText']
        val = [int(s) for s in published_time.split() if s.isdigit()][0]
        for keyword, kwarg, per_unit in units:
            if keyword in published_time:
                published = dt.datetime.now() - dt.timedelta(**{kwarg: per_unit * val})
                return published.year, published.month
    # Missing fields, no numeric token, or an amount too large for a date
    except (KeyError, TypeError, AttributeError, IndexError, ValueError, OverflowError):
        return None, None
    # No known time unit in the text
    return None, None

def get_view_count(content):
    """Parse the view count from ``content['viewCountText']['simpleText']``.

    The text before the first ``' view'`` is taken as the number, so both
    ``'1,234 views'`` and the singular ``'1 view'`` are understood; thousands
    separators (commas) are removed before conversion.

    Returns:
        The count as an int, or None when the field is missing or the text
        does not start with a plain number (e.g. ``'No views'``).
    """
    try:
        text = content['viewCountText']['simpleText']
        # ' view' is a prefix of ' views', so one split covers singular and plural
        return int(text.split(' view')[0].replace(',', ''))
    except (KeyError, TypeError, AttributeError, ValueError):
        return None

def get_url(content):
    """Build the absolute watch URL of a YouTube video.

    Reads the relative URL stored under
    ``content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']``
    and prefixes it with the YouTube origin.

    Returns:
        The absolute URL, or None when the path is missing.

    Note:
        A later cell of this notebook defines another ``get_url`` for Medium
        cards; whichever definition ran last is the one in effect.
    """
    # The origin is spelled out here: the base_url used by get_youtube_videos
    # is local to that function and not visible from this one
    youtube_origin = 'https://www.youtube.com'
    try:
        path = content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
        return youtube_origin + path
    except (KeyError, TypeError):
        return None

def get_description(content):
    """Return the text of the first detailed metadata snippet of a video.

    Joins, with single spaces, the ``text`` of every run under
    ``content['detailedMetadataSnippets'][0]['snippetText']['runs']``.

    Returns:
        The joined snippet text, or None when the video carries no snippet.
    """
    try:
        runs = content['detailedMetadataSnippets'][0]['snippetText']['runs']
        return ' '.join(run['text'] for run in runs)
    # Absent keys, an empty snippet list or an unexpected shape mean "no description"
    except (KeyError, IndexError, TypeError):
        return None

def get_skills(title, description):
    """Extract skills mentioned in a video's title and description.

    Args:
        title: Video title.
        description: Video description, or None when there is none.

    Returns:
        ``(skills, data_skills)``, each a ``'; '``-joined string or None when
        the corresponding list is empty.

    Note:
        A later cell of this notebook defines another ``get_skills`` for
        Medium stories; whichever definition ran last is the one in effect.
    """
    context = title if description is None else title + ' ' + description
    found = extract_skills(context)
    # Ignore the Video skill as it is not relevant for Youtube
    if 'Video' in found:
        found.remove('Video')
    kept, _ = extract_ignore(found)
    kept.sort()
    if len(kept) == 0:
        return None, None
    joined_skills = '; '.join(kept)
    data_related = extract_data_skills(kept)
    if len(data_related) == 0:
        return joined_skills, None
    return joined_skills, '; '.join(data_related)

In [None]:
# Collect one frame per skill and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the data on every call
yt_frames = []

for skill in DATA_SKILLS:
    try:
        df_temp = get_youtube_videos(skill, 'this_month')
        # get_youtube_videos returns None when the request fails
        if df_temp is not None:
            yt_frames.append(df_temp)
    except Exception as e:
        print('Error in scraping Youtube for {}'.format(skill), e)
    # Pause between searches to avoid hammering the site
    time.sleep(5)

df_yt = pd.concat(yt_frames) if yt_frames else pd.DataFrame()
df_yt.head()

In [None]:
engine = create_engine(settings['skills_db2'])
# NOTE: this replaces the freshly scraped df_yt with the contents of
# database/youtube.csv; no visible cell of this notebook writes that file
df_yt = pd.read_csv('database/youtube.csv')
# Keep only the last whitespace-separated token of the length;
# missing lengths are read back as NaN and are left untouched
df_yt['length'] = df_yt['length'].apply(lambda x: x.split()[-1] if isinstance(x, str) else x)
df_yt.head()

In [None]:
# if_exists='replace' drops and recreates the ContentYoutube table.
# Dispose of the engine even when the upload fails so no connection is left open.
try:
    df_yt.to_sql('ContentYoutube', engine, index=False, if_exists='replace')
finally:
    engine.dispose()

In [None]:
# (total number of videos, number of videos with a non-null data_skills value)
len(df_yt), len(df_yt[~df_yt['data_skills'].isna()])

## Medium

In [2]:
def medium_scraper(tag, date):
    """Scrape one day of a Medium tag archive and keep the popular stories.

    Args:
        tag: Medium tag slug inserted into the archive URL.
        date: Day to scrape; formatted as ``YYYY/MM/DD`` in the URL and stored
            unchanged as ``published_date``.

    Returns:
        A DataFrame with one row per story that has a title, is not a
        comment and has at least 100 claps. The frame has no columns when no
        story qualifies.
    """
    archive_url = 'https://medium.com/tag/{}/archive/'.format(tag) + date.strftime('%Y/%m/%d')
    response = requests.get(archive_url, headers=random.choice(headers_list))
    soup = BeautifulSoup(response.content, 'html.parser')
    rows = []
    # Each card of the feed is either a story or a comment
    for card in soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem'):
        title = get_title(card)
        subtitle = get_subtitle(card)
        claps = get_claps(card)
        # Drop untitled cards, comments, and stories with unknown or fewer than 100 claps
        if title is None or is_comment(card) or claps is None or claps < 100:
            continue
        skills, data_skills = get_skills(title, subtitle)
        rows.append({
            'id': get_id(card),
            'title': title,
            'subtitle': subtitle,
            'author': get_author(card),
            'publication': get_publication(card),
            'published_date': date,
            'read_time_mins': get_read_time(card),
            'claps': claps,
            'url': get_url(card),
            'skills': skills,
            'data_skills': data_skills,
        })
    return pd.DataFrame.from_dict(rows)

def get_id(card):
    """Return the ``data-post-id`` of the card's article container, or None if absent."""
    article = card.find('div', class_='postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls')
    return None if article is None else article['data-post-id']

def get_title(card):
    """Return the text of the first title element found in the card, else None.

    Medium renders titles with several tag/class combinations; they are tried
    in the order listed and the first hit wins.
    """
    candidates = (
        ('h3', 'graf graf--h3 graf-after--figure graf--title'),
        ('h3', 'graf graf--h3 graf-after--figure graf--trailing graf--title'),
        ('h4', 'graf graf--h4 graf--leading'),
        ('h3', 'graf graf--h3 graf--leading graf--title'),
        ('p', 'graf graf--p graf--leading'),
        ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf--leading graf--title'),
        ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf-after--figure graf--trailing graf--title'),
    )
    for tag_name, css_classes in candidates:
        node = card.find(tag_name, class_=css_classes)
        if node is not None:
            return node.text
    return None

def get_subtitle(card):
    """Return the text of the first subtitle element found in the card, else None.

    Medium renders subtitles with several tag/class combinations; they are
    tried in the order listed and the first hit wins.
    """
    candidates = (
        ('h4', 'graf graf--h4 graf-after--h3 graf--subtitle'),
        ('h4', 'graf graf--h4 graf-after--h3 graf--trailing graf--subtitle'),
        ('strong', 'markup--strong markup--p-strong'),
        ('h4', 'graf graf--p graf-after--h3 graf--trailing'),
        ('p', 'graf graf--p graf-after--h3 graf--trailing'),
        ('blockquote', 'graf graf--pullquote graf-after--figure graf--trailing'),
        ('p', 'graf graf--p graf-after--figure'),
        ('blockquote', 'graf graf--blockquote graf-after--h3 graf--trailing'),
        ('p', 'graf graf--p graf-after--figure graf--trailing'),
        ('em', 'markup--em markup--p-em'),
        ('p', 'graf graf--p graf-after--p graf--trailing'),
    )
    for tag_name, css_classes in candidates:
        node = card.find(tag_name, class_=css_classes)
        if node is not None:
            return node.text
    return None

def get_author(card):
    """Return the text of the card's author link, or None when there is no such link."""
    link = card.find('a', class_='ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken')
    return None if link is None else link.text

def get_publication(card):
    """Return the text of the card's publication link, or None when there is no such link."""
    link = card.find('a', class_='ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal')
    return None if link is None else link.text

def get_read_time(card):
    """Return the reading time from the card's ``readingTime`` span.

    The span's ``title`` attribute is returned as a string with the
    ``' min read'`` suffix removed; None when the span is absent.
    """
    span = card.find('span', class_='readingTime')
    if span is None:
        return None
    return span['title'].replace(' min read', '')

def get_claps(card):
    """Return the clap count shown on the card as an int.

    Counts abbreviated with ``K`` (e.g. ``'1.2K'``) are expanded to
    thousands.

    Returns:
        The count, or None when the clap button is absent or its text is not
        a number.
    """
    button = card.find('button', class_='button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents')
    if button is None:
        return None
    text = button.text
    try:
        if 'K' in text:
            # round() before int() so a product that lands just below a whole
            # number because of float representation is not truncated
            return int(round(float(text.replace('K', '')) * 1000))
        return int(text)
    # Only a malformed number means "unknown"; other errors should surface
    except ValueError:
        return None

def is_comment(card):
    """Tell whether the card is a comment rather than a story.

    A card counts as a comment when it contains the bordered box ``div``
    identified by the class list below.
    """
    comment_box_classes = 'u-fontSize14 u-marginTop10 u-marginBottom20 u-padding14 u-xs-padding12 u-borderRadius3 u-borderCardBackground u-borderLighterHover u-boxShadow1px4pxCardBorder'
    return card.find('div', class_=comment_box_classes) is not None

def get_url(card):
    """Return the card's story URL without its query string, or None.

    The URL is the ``href`` of the first ``<a>`` found with an empty class
    filter, cut at the first ``'?'``.

    Note:
        This definition replaces the ``get_url`` defined earlier in the
        notebook for YouTube results.
    """
    anchor = card.find('a', class_='')
    if anchor is None:
        return None
    clean_url, _, _ = anchor['href'].partition('?')
    return clean_url

def get_skills(title, subtitle):
    """Extract skills mentioned in a story's title and subtitle.

    Args:
        title: Story title.
        subtitle: Story subtitle, or None when there is none.

    Returns:
        ``(skills, data_skills)``, each a ``'; '``-joined string or None when
        the corresponding list is empty.

    Note:
        This definition replaces the ``get_skills`` defined earlier in the
        notebook for YouTube results; unlike that one, it does not drop the
        'Video' skill.
    """
    context = title if subtitle is None else title + ' ' + subtitle
    kept, _ = extract_ignore(extract_skills(context))
    kept.sort()
    if len(kept) == 0:
        return None, None
    joined_skills = '; '.join(kept)
    data_related = extract_data_skills(kept)
    if len(data_related) == 0:
        return joined_skills, None
    return joined_skills, '; '.join(data_related)

In [None]:
# Scrape the 'data-science' tag archive day by day (end_date itself is excluded)
tag = 'data-science'
start_date = dt.datetime(2021, 1, 1)
end_date = dt.datetime(2021, 9, 22)
current_date = start_date
csv_path = 'database/medium_data_science.csv'
# The header goes out with the first non-empty frame: a day without qualifying
# stories yields a frame without columns, which would otherwise leave the
# CSV headerless because every later write uses header=False
header_written = False

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if not df.empty:
        df.to_csv(csv_path, index=False, mode='a' if header_written else 'w', header=not header_written)
        header_written = True
    current_date = current_date + dt.timedelta(days=1)
    # Pause between requests to avoid hammering the site
    time.sleep(3)

In [None]:
# Scrape the 'machine-learning' tag archive day by day (end_date itself is excluded)
tag = 'machine-learning'
start_date = dt.datetime(2021, 1, 1)
end_date = dt.datetime(2021, 9, 22)
current_date = start_date
csv_path = 'database/medium_machine_learning.csv'
# The header goes out with the first non-empty frame: a day without qualifying
# stories yields a frame without columns, which would otherwise leave the
# CSV headerless because every later write uses header=False
header_written = False

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if not df.empty:
        df.to_csv(csv_path, index=False, mode='a' if header_written else 'w', header=not header_written)
        header_written = True
    current_date = current_date + dt.timedelta(days=1)
    # Pause between requests to avoid hammering the site
    time.sleep(3)

In [None]:
# Scrape the 'data-engineering' tag archive day by day (end_date itself is excluded)
tag = 'data-engineering'
start_date = dt.datetime(2021, 1, 1)
end_date = dt.datetime(2021, 9, 22)
current_date = start_date
csv_path = 'database/medium_data_engineering.csv'
# The header goes out with the first non-empty frame: a day without qualifying
# stories yields a frame without columns, which would otherwise leave the
# CSV headerless because every later write uses header=False
header_written = False

for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    if not df.empty:
        df.to_csv(csv_path, index=False, mode='a' if header_written else 'w', header=not header_written)
        header_written = True
    current_date = current_date + dt.timedelta(days=1)
    # Pause between requests to avoid hammering the site
    time.sleep(3)

In [None]:
# Read the data-engineering archive back from disk and preview its first rows
df_de = pd.read_csv('database/medium_data_engineering.csv')
df_de.head()

In [None]:
# Merge the three per-tag archives into one de-duplicated, date-sorted table
medium_csvs = [
    'database/medium_data_science.csv',
    'database/medium_machine_learning.csv',
    'database/medium_data_engineering.csv',
]
df_med = (
    # pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0)
    pd.concat([pd.read_csv(path) for path in medium_csvs])
    # A story listed under several tags is kept once
    .drop_duplicates(subset=['id'])
    .assign(
        # The scrape cells write datetimes through to_csv, i.e. in ISO
        # year-month-day form, so no dayfirst hint is needed
        published_date=lambda frame: pd.to_datetime(frame['published_date']),
        # Strip the query string; missing URLs are read back as NaN and left as is
        url=lambda frame: frame['url'].map(lambda u: u.split('?')[0] if isinstance(u, str) else u),
    )
    .sort_values(by=['published_date', 'id'])
    .reset_index(drop=True)
)
df_med.head()

In [None]:
# Write the current df_med table to a single CSV file
df_med.to_csv('database/medium.csv', index=False)

In [7]:
# Incremental scrape: every tag for each day in [start_date, end_date)
tags = ['data-science', 'machine-learning', 'data-engineering']
start_date = dt.datetime(2021, 9, 25)
end_date = dt.datetime(2021, 9, 28)
# Collect the daily frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the data on every call
med_frames = []

for tag in tags:
    current_date = start_date
    for i in range((end_date - start_date).days):
        df_temp = medium_scraper(tag, current_date)
        # Days without qualifying stories yield a frame without columns; skip them
        if not df_temp.empty:
            med_frames.append(df_temp)
        current_date = current_date + dt.timedelta(days=1)
        # Pause between requests to avoid hammering the site
        time.sleep(3)

df_med = pd.concat(med_frames) if med_frames else pd.DataFrame()
df_med.head()

Unnamed: 0,id,title,subtitle,author,publication,published_date,read_time_mins,claps,url,skills,data_skills
0,86eb2a96bec3,Stop Hardcoding Sensitive Data in Your Python ...,Keep your settings and credentials private…,Ahmed Besbes,Towards Data Science,2021-09-25,5,619,https://towardsdatascience.com/stop-hardcoding...,Python,Python Programming
1,c2cb58895bbe,Best OpenSource AutoML frameworks in 2021,A curated list of popular,Tech Ninja,Technology Now and Next,2021-09-25,6,208,https://medium.com/technexthere/best-opensourc...,,
2,d3657b7b5526,You can use Low Code Python in Jupyter (Updated),,Jake from Mito,,2021-09-25,4,109,https://medium.com/@jjdiamondreivich/low-code-...,Jupyter; Python,Python Programming
3,d404539a5c49,Start Analyzing Your Investment Risks With 4 F...,An in-depth look at…,Justin Jimenez,DataDrivenInvestor,2021-09-25,11,100,https://medium.datadriveninvestor.com/start-an...,Investment; Statistics,Statistics
4,60b01c41c485,How To Apply a Function To Columns in Pandas,Discussing when to use apply() or map() when a...,Giorgos Myrianthous,Towards Data Science,2021-09-25,3,105,https://towardsdatascience.com/apply-function-...,Pandas,


In [10]:
# Order the stories by date then id, and keep the first row of every story id
df_med = (
    df_med
    .sort_values(by=['published_date', 'id'])
    .drop_duplicates(subset=['id'])
)
df_med

Unnamed: 0,id,title,subtitle,author,publication,published_date,read_time_mins,claps,url,skills,data_skills
2,2af9b74de418,Where are Self-Driving Cars??,What is Preventing Self-Driving Cars From Beco...,Fazal Mittu,Visionary Hub,2021-09-25,6,158,https://medium.com/visionary-hub/where-are-sel...,,
0,30f114f1d4a7,Data Engineering,The ETL and ELT are necessary for data science...,Bereket Kibru,,2021-09-25,3,105,https://medium.com/@bekakibru2/data-engineerin...,Data Engineering; Data Science; Database; Extr...,ETL; NoSQL; SQL
3,3a606d4aadcd,NEAT Spiking Neural Networks for Reinforcement...,Paper Summary: “Evolving Spiking Neural…,Dickson Wu,Geek Culture,2021-09-25,9,201,https://medium.com/geekculture/neat-spiking-ne...,Neural Network; Reinforcement Learning,Reinforcement
1,409d4397f74e,Rise of the Streaming Databases — Episode 2 : ...,How Pinot solves the toughest problems in…,Dunith Dhanushka,Event-driven Utopia,2021-09-25,8,168,https://medium.com/event-driven-utopia/rise-of...,Apache; Database; Streaming,Apache
4,60b01c41c485,How To Apply a Function To Columns in Pandas,Discussing when to use apply() or map() when a...,Giorgos Myrianthous,Towards Data Science,2021-09-25,3,105,https://towardsdatascience.com/apply-function-...,Pandas,
0,86eb2a96bec3,Stop Hardcoding Sensitive Data in Your Python ...,Keep your settings and credentials private…,Ahmed Besbes,Towards Data Science,2021-09-25,5,619,https://towardsdatascience.com/stop-hardcoding...,Python,Python Programming
6,a33a2084ae98,Real-time Artwork Generation using Deep Learning,Adaptive Instance Normalisation(AdaIN) for Style…,Aadhithya Sankar,Towards Data Science,2021-09-25,6,118,https://towardsdatascience.com/real-time-artwo...,Deep Learning,Deep Learning
1,c2cb58895bbe,Best OpenSource AutoML frameworks in 2021,A curated list of popular,Tech Ninja,Technology Now and Next,2021-09-25,6,208,https://medium.com/technexthere/best-opensourc...,,
4,c607fe973dff,How to get stock data in Python,Stock market clustering in python,Gautamankul,,2021-09-25,2,359,https://medium.com/@gautamankul/how-to-get-sto...,Clustering; Python; Stock Market,Python Programming
7,c65c3111cd39,Relational and Non-relational Database,"Data is the new oil of business as they say, b...",Ezekiel Ajayi,,2021-09-25,3,156,https://medium.com/@ajayiezekiel9000/relationa...,Relational Database,


In [11]:
# Append the newly scraped stories to the ContentMedium table.
# To rebuild the table from scratch instead, load database/medium.csv into
# df_med and call to_sql with if_exists='replace'.
# NOTE: with if_exists='append', re-running this cell inserts the same rows again.
engine = create_engine(settings['skills_db2'])
try:
    df_med['published_date'] = pd.to_datetime(df_med['published_date'])
    df_med.to_sql('ContentMedium', engine, index=False, if_exists='append')
finally:
    # Dispose of the engine even when the upload fails so no connection is left open
    engine.dispose()

## KDnuggets

In [16]:
# Load the KDnuggets articles and convert the date column to datetimes
df_kd = pd.read_csv('database/kdnuggets.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
df_kd.head()

Unnamed: 0,id,title,author,date,url,description,type,tags,skills,data_skills
0,121033,Six Tips on Building a Data Science Team at a ...,Zbar & Vallejo,2021-01-04,https://www.kdnuggets.com/2021/01/six-tips-bui...,When a company decides that they want to start...,opinions,"Data Science, Data Science Team, Data Scientist",Data Science,
1,121054,All Machine Learning Algorithms You Should Kno...,Terence Shin,2021-01-04,https://www.kdnuggets.com/2021/01/machine-lear...,Many machine learning algorithms exits that ra...,tutorials,"Algorithms, Decision Trees, Explained, Gradien...",Bayesian Regression; Boosting; Decision Tree; ...,Regressions
2,121083,DeepMind’s MuZero is One of the Most Important...,Jesus Rodriguez,2021-01-04,https://www.kdnuggets.com/2021/01/deepmind-muz...,MuZero takes a unique approach to solve the pr...,tutorials,"AlphaZero, Deep Learning, DeepMind, MuZero, Re...",Deep Learning; Reinforcement Learning,Deep Learning; Reinforcement
3,121109,"Model Experiments, Tracking and Registration u...",Dash Desai,2021-01-05,https://www.kdnuggets.com/2021/01/model-experi...,This post covers how StreamSets can help exped...,tutorials,"Data Science, Databricks, DataOps, Experimenta...",Data Science; Databricks; MLFlow; MLOps; Machi...,
4,121117,How to Get a Job as a Data Engineer,Anna Anisienia,2021-01-05,https://www.kdnuggets.com/2021/01/get-job-as-d...,Data engineering skills are currently in high ...,opinions,"Career Advice, Data Engineer, Data Engineering",Data Engineering; Sentry,


In [17]:
# Show column dtypes and non-null counts of the KDnuggets table
df_kd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           587 non-null    int64         
 1   title        587 non-null    object        
 2   author       587 non-null    object        
 3   date         587 non-null    datetime64[ns]
 4   url          587 non-null    object        
 5   description  587 non-null    object        
 6   type         587 non-null    object        
 7   tags         587 non-null    object        
 8   skills       582 non-null    object        
 9   data_skills  391 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 46.0+ KB


In [19]:
# Upload the KDnuggets table; if_exists='replace' drops and recreates ContentKDnuggets
engine = create_engine(settings['skills_db2'])
try:
    df_kd.to_sql('ContentKDnuggets', engine, index=False, if_exists='replace')
finally:
    # Dispose of the engine even when the upload fails so no connection is left open
    engine.dispose()