In [1]:
import random
import requests
import pandas as pd
import json
import datetime as dt
import time
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from headers import headers_list
from data_skills import DATA_SKILLS
from skill_extraction import extract_skills, extract_ignore, extract_data_skills
from secrets import settings, api_keys

## JMLR

In [None]:
def jmlr_scraper(engine, volume=22):
    """Scrape a JMLR volume index page and return the papers not yet stored.

    Parameters
    ----------
    engine
        Database connectable passed to ``pd.read_sql_query``; the ``id`` and
        ``title`` columns of the ``"ContentJMLR"`` table are read from it.
    volume : int, optional
        JMLR volume number used to build the index URL. Defaults to 22, the
        volume that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame or None
        One row per paper whose title is not already in ``"ContentJMLR"``,
        with an ``id`` column continuing after the largest stored id.
        ``None`` when the index page does not return HTTP 200 or when there
        are no new papers.
    """
    base_url = 'https://jmlr.org'
    url = '{}/papers/v{}/'.format(base_url, volume)
    page = requests.get(url, headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # Get existing papers in database; a set gives constant-time membership tests
    df_ex = pd.read_sql_query('select cj.id, cj.title from "ContentJMLR" cj', engine)
    ex_papers = set(df_ex['title'])
    papers = []
    # Iterate through each paper
    for dl in soup.find_all('dl'):
        title_tag = dl.find('dt')
        dd = dl.find('dd')
        # Skip <dl> elements that do not have the expected title/details pair
        if title_tag is None or dd is None:
            continue
        title = title_tag.get_text()
        if title in ex_papers:
            continue
        dd_text = dd.get_text()
        paper = {
            'title': title,
            'authors': dd_text.split(';')[0].strip(),
            'journal_num': dd_text.split(';')[-1].split('\n')[0].strip(),
        }
        # Each link is stored under its own link text as the key
        for a in dd.find_all('a'):
            label = a.get_text()
            href = a.get('href')
            if label == '(Machine Learning Open Source Software Paper)' or href is None:
                continue
            if 'http' not in href:
                href = base_url + href
            paper[label] = href
        # Get abstract of paper and extract skills; the abstract lookup reads
        # paper['abs'], so skip it for papers that have no such link.
        output = get_abstract_skills(paper) if 'abs' in paper else None
        if output is not None:
            paper['abstract'] = output[0]
            if len(output[1]) > 0:
                paper['skills'] = '; '.join(output[1])
                data_skills = extract_data_skills(output[1])
                if len(data_skills) > 0:
                    paper['data_skills'] = '; '.join(data_skills)
        papers.append(paper)
    # Compile into dataframe if we have new papers
    if not papers:
        return None
    df = pd.DataFrame(papers)
    # max() of an empty column raises, so fall back to 0 and start ids at 1.
    # NOTE(review): confirm 1 is the desired first id for an empty table.
    max_id = df_ex['id'].max() if not df_ex.empty else 0
    df['id'] = df.index + max_id + 1
    return df

def get_abstract_skills(paper):
    """Fetch a paper's abstract page and extract skills from title + abstract.

    Parameters
    ----------
    paper : dict
        Must contain ``'title'``; ``'abs'`` is the URL of the abstract page.

    Returns
    -------
    tuple or None
        ``(abstract, keep_skills)`` where ``keep_skills`` is the sorted first
        element returned by ``extract_ignore``. ``None`` when the paper has no
        ``'abs'`` URL, the request does not return HTTP 200, or the page has
        no ``<p class="abstract">`` element.
    """
    abs_url = paper.get('abs')
    if abs_url is None:
        return None
    page = requests.get(abs_url, headers=random.choice(headers_list))
    if page.status_code != 200:
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    abstract_tag = soup.find('p', class_='abstract')
    # find() returns None when the element is missing; treat that like a failed fetch
    if abstract_tag is None:
        return None
    abstract = abstract_tag.get_text().strip('\n')
    all_skills = extract_skills(paper['title'] + ' ' + abstract)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    return abstract, keep_skills

In [None]:
engine = create_engine(settings['skills_db'])
df_jmlr = jmlr_scraper(engine)
# jmlr_scraper returns None when the request fails or there are no new papers,
# so only preview the frame when one was produced.
df_jmlr.head() if df_jmlr is not None else None

In [None]:
# Release the connections held by the engine created for the JMLR scrape.
engine.dispose()

In [None]:
# jmlr_scraper returns None when nothing new was scraped; only write a CSV
# when there is a frame to save.
if df_jmlr is not None:
    df_jmlr.to_csv('database/jmlr.csv', index=False)

## YouTube

In [None]:
def get_youtube_videos(skill, filter_time=None):
    """Search YouTube for "learn <skill>" and return the video results.

    Parameters
    ----------
    skill : str
        Skill name appended to the word "learn" to form the search query.
    filter_time : str or None, optional
        One of ``'this_year'``, ``'this_month'``, ``'this_week'``, ``'today'``
        to restrict results by upload date, or ``None`` (default) for no filter.

    Returns
    -------
    pandas.DataFrame or None
        One row per video with columns id, title, channel, published_year,
        published_month, length, view_count, url, description, skills and
        data_skills. ``None`` when ``filter_time`` is not a recognised value,
        the request does not return HTTP 200, or the page does not contain the
        ``ytInitialData`` payload.

    Notes
    -----
    ``get_url`` and ``get_skills`` are resolved when this function is called.
    The Medium section of this notebook defines functions with the same names,
    so re-run the YouTube helper cell before calling this after the Medium
    cell has been executed.
    """
    base_url = 'https://www.youtube.com'
    # Dictionary for filtering search query
    sp_dict = {'this_year': 'EgQIBRAB', 'this_month': 'EgQIBBAB', 'this_week': 'EgQIAxAB', 'today': 'EgQIAhAB'}
    # None means "no filter"; any other unknown value is rejected
    if filter_time is not None and filter_time not in sp_dict:
        return None
    url = base_url + '/results'
    query = 'learn ' + skill
    # requests URL-encodes the parameters itself (spaces become '+'); replacing
    # spaces by hand would send a literal '+' encoded as '%2B'.
    params = {'search_query': query}
    if filter_time is not None:
        params['sp'] = sp_dict[filter_time]
    page = requests.get(url, params=params, headers=random.choice(headers_list))
    if page.status_code != 200:
        print(page, page.reason)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # The search results are embedded as a JSON literal assigned in a <script> tag
    marker = 'var ytInitialData = '
    scripts_html = str(soup.find_all('script'))
    if marker not in scripts_html:
        print('ytInitialData not found in search results for {!r}'.format(query))
        return None
    json_text = scripts_html.split(marker)[-1].split(';</script>')[0]
    res = json.loads(json_text)
    res = res['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']
    video_list = []
    # Iterate through each video
    for contents in res:
        # Get only those with video
        if 'itemSectionRenderer' not in contents:
            continue
        for content in contents['itemSectionRenderer']['contents']:
            # Ignore ads
            if 'videoRenderer' not in content:
                continue
            content = content['videoRenderer']
            title = get_text(content, 'title')
            if title is None:
                continue
            description = get_description(content)
            skills, data_skills = get_skills(title, description)
            published_year, published_month = get_published_date(content)
            video_list.append({
                'id': content['videoId'],
                'title': title,
                'channel': get_text(content, 'ownerText'),
                'published_year': published_year,
                'published_month': published_month,
                'length': get_length(content),
                'view_count': get_view_count(content),
                'url': get_url(content),
                'description': description,
                'skills': skills,
                'data_skills': data_skills
            })
    # Explicit columns keep the schema stable even when no videos were found.
    # 'length' stays a colon-separated string as produced by get_length.
    columns = ['id', 'title', 'channel', 'published_year', 'published_month', 'length',
               'view_count', 'url', 'description', 'skills', 'data_skills']
    return pd.DataFrame(video_list, columns=columns)

def get_text(content, info):
    """Join the ``'text'`` of every run under ``content[info]['runs']``.

    Returns the texts joined by single spaces, or ``None`` when the expected
    keys are missing or the structure has an unexpected type.
    """
    try:
        return ' '.join(t['text'] for t in content[info]['runs'])
    except (KeyError, TypeError):
        # Only lookup/shape problems are treated as "no text"; anything else
        # (e.g. KeyboardInterrupt) propagates.
        return None

def get_length(content):
    """Return the video length as a three-part colon-separated string.

    Reads ``content['lengthText']['simpleText']`` and left-pads it with
    ``'00'`` parts until it has three parts (e.g. ``'3:45'`` -> ``'00:3:45'``).
    Returns ``None`` when the length text is missing or not a string.
    """
    try:
        parts = content['lengthText']['simpleText'].split(':')
    except (KeyError, TypeError, AttributeError):
        return None
    # Pad missing hour/minute parts; longer inputs are returned unchanged
    while len(parts) < 3:
        parts.insert(0, '00')
    return ':'.join(parts)

def get_published_date(content):
    """Estimate the publication year and month from a relative time text.

    Reads ``content['publishedTimeText']['simpleText']`` (for example
    ``'3 weeks ago'``), takes the first whitespace-separated all-digit token
    as the amount, and subtracts that many units from the current time.

    Returns
    -------
    tuple
        ``(year, month)`` of the estimated date, or ``(None, None)`` when the
        text is missing, has no number, names no known unit, or the result is
        outside the range ``datetime`` can represent.
    """
    try:
        published_time = content['publishedTimeText']['simpleText']
        amounts = [int(s) for s in published_time.split() if s.isdigit()]
    except (KeyError, TypeError, AttributeError):
        return None, None
    if not amounts:
        return None, None
    val = amounts[0]
    # (substring to look for, timedelta keyword, units per amount); checked in
    # this order. Years and months use average lengths in days.
    units = (
        ('year', 'days', 365.25),
        ('month', 'days', 30.436875),
        ('week', 'weeks', 1),
        ('day', 'days', 1),
        ('hour', 'hours', 1),
        ('minute', 'minutes', 1),
        ('second', 'seconds', 1),
    )
    for unit, keyword, factor in units:
        if unit in published_time:
            try:
                published = dt.datetime.now() - dt.timedelta(**{keyword: factor * val})
            except OverflowError:
                return None, None
            return published.year, published.month
    # No recognised unit in the text
    return None, None

def get_view_count(content):
    """Parse the view count from ``content['viewCountText']['simpleText']``.

    The first whitespace-separated token is read as an integer after removing
    thousands separators, so both ``'1,234 views'`` and the singular
    ``'1 view'`` are handled. Returns ``None`` when the text is missing or its
    first token is not a number (e.g. ``'No views'``).
    """
    try:
        text = content['viewCountText']['simpleText']
        return int(text.split()[0].replace(',', ''))
    except (KeyError, TypeError, AttributeError, IndexError, ValueError):
        return None

def get_url(content):
    """Return the absolute YouTube URL for a video renderer dict.

    Reads the relative URL at
    ``content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']``
    and prefixes it with the YouTube host. Returns ``None`` when that path is
    missing.

    NOTE: the Medium section of this notebook defines another ``get_url``;
    once that cell has run, this definition is shadowed until this cell is
    re-run.
    """
    # The host is spelled out here because the ``base_url`` used by
    # get_youtube_videos is local to that function and not visible in this one.
    youtube_base_url = 'https://www.youtube.com'
    try:
        url = content['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
    except (KeyError, TypeError):
        return None
    if not isinstance(url, str):
        return None
    return youtube_base_url + url

def get_description(content):
    """Return the description snippet of a video renderer dict.

    Joins, with single spaces, the ``'text'`` of every run in
    ``content['detailedMetadataSnippets'][0]['snippetText']['runs']``.
    Returns ``None`` when that path is missing or has an unexpected shape.
    """
    try:
        runs = content['detailedMetadataSnippets'][0]['snippetText']['runs']
        return ' '.join([t['text'] for t in runs])
    except (KeyError, IndexError, TypeError):
        return None

def get_skills(title, description):
    """Extract skills from a video's title and (optional) description.

    The text passed to ``extract_skills`` is the title, followed by a space and
    the description when one is given. Returns ``(skills, data_skills)`` as
    '; '-joined strings; ``data_skills`` is ``None`` when
    ``extract_data_skills`` yields nothing, and both are ``None`` when no
    skills are kept.

    NOTE: the Medium section of this notebook defines another ``get_skills``
    with the same signature; whichever cell ran last is the one in effect.
    """
    context = title if description is None else title + ' ' + description
    all_skills = extract_skills(context)
    # Ignore the Video skill as it is not relevant for Youtube
    if 'Video' in all_skills:
        all_skills.remove('Video')
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    if len(keep_skills) == 0:
        return None, None
    data_skills = extract_data_skills(keep_skills)
    joined_data_skills = '; '.join(data_skills) if len(data_skills) > 0 else None
    return '; '.join(keep_skills), joined_data_skills

In [None]:
# Collect one frame per skill and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
yt_frames = []

for skill in DATA_SKILLS:
    try:
        df_temp = get_youtube_videos(skill, 'this_month')
        # get_youtube_videos returns None when the request fails
        if df_temp is not None:
            yt_frames.append(df_temp)
    except Exception as e:
        print('Error in scraping Youtube for {}'.format(skill), e)
    # Pause between requests
    time.sleep(5)

# Per-frame indices are kept, as with the previous append-based loop
df_yt = pd.concat(yt_frames) if yt_frames else pd.DataFrame()

df_yt.head()

In [None]:
engine = create_engine(settings['skills_db2'])
# NOTE(review): this replaces the scraped df_yt with the contents of
# database/youtube.csv, which none of the visible cells write.
df_yt = pd.read_csv('database/youtube.csv')
# Keep only the last whitespace-separated token of each length value; missing
# lengths (NaN after read_csv) are left untouched instead of raising.
df_yt['length'] = df_yt['length'].map(
    lambda x: x.split()[-1] if isinstance(x, str) and x.strip() else x
)
df_yt.head()

In [None]:
# Replace the ContentYoutube table with df_yt; dispose of the engine even if
# the write fails so its connections are not left open.
try:
    df_yt.to_sql('ContentYoutube', engine, index=False, if_exists='replace')
finally:
    engine.dispose()

In [None]:
# (total videos, videos that have at least one data skill)
len(df_yt), len(df_yt[df_yt['data_skills'].notna()])

## Medium

In [2]:
def medium_scraper(tag, date, min_claps=100):
    """Scrape the Medium archive page of a tag for one day.

    Parameters
    ----------
    tag : str
        Medium tag slug inserted into the archive URL (e.g. ``'data-science'``).
    date : datetime.date or datetime.datetime
        Day to scrape; formatted as ``YYYY/MM/DD`` in the URL and stored as-is
        in the ``published_date`` column.
    min_claps : int, optional
        Stories with fewer claps are skipped. Defaults to 100, the threshold
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per kept story with columns id, title, subtitle, author,
        publication, published_date, read_time_mins, claps, url, skills and
        data_skills. The frame is empty (but still has these columns) when the
        request does not return HTTP 200 or no story qualifies.
    """
    # Fixed column list so an empty result still has a schema (and a CSV header)
    columns = ['id', 'title', 'subtitle', 'author', 'publication', 'published_date',
               'read_time_mins', 'claps', 'url', 'skills', 'data_skills']
    base_url = 'https://medium.com/tag/{}/archive/'
    url = base_url.format(tag) + date.strftime('%Y/%m/%d')
    page = requests.get(url, headers=random.choice(headers_list))
    if page.status_code != 200:
        # Callers use the result as a frame, so report the failure and return
        # an empty one rather than None.
        print(page, page.reason)
        return pd.DataFrame(columns=columns)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Pulls each card from the feed. Each card is a story or comment
    cards = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')
    card_list = []
    for card in cards:
        title = get_title(card)
        subtitle = get_subtitle(card)
        claps = get_claps(card)
        # Skip comments and cards without a parsable title or clap count
        if title is None or is_comment(card) or claps is None:
            continue
        if claps < min_claps:
            continue
        skills, data_skills = get_skills(title, subtitle)
        card_list.append({
            'id': get_id(card),
            'title': title,
            'subtitle': subtitle,
            'author': get_author(card),
            'publication': get_publication(card),
            'published_date': date,
            'read_time_mins': get_read_time(card),
            'claps': claps,
            'url': get_url(card),
            'skills': skills,
            'data_skills': data_skills,
        })
    return pd.DataFrame(card_list, columns=columns)

def get_id(card):
    """Return the ``data-post-id`` of the card's article div, or None if the div is absent."""
    article = card.find(
        'div',
        class_='postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls',
    )
    return None if article is None else article['data-post-id']

def get_title(card):
    """Return the text of the first title element found in the card, or None.

    Medium renders titles with several tag/class combinations; they are tried
    in the order listed and the first match wins.
    """
    selectors = (
        ('h3', 'graf graf--h3 graf-after--figure graf--title'),
        ('h3', 'graf graf--h3 graf-after--figure graf--trailing graf--title'),
        ('h4', 'graf graf--h4 graf--leading'),
        ('h3', 'graf graf--h3 graf--leading graf--title'),
        ('p', 'graf graf--p graf--leading'),
        ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf--leading graf--title'),
        ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf-after--figure graf--trailing graf--title'),
    )
    # Lazy generator: lookups stop at the first selector that matches
    candidates = (card.find(tag_name, class_=css) for tag_name, css in selectors)
    match = next((node for node in candidates if node is not None), None)
    if match is None:
        return None
    return match.text

def get_subtitle(card):
    """Return the text of the first subtitle element found in the card, or None.

    Subtitles appear under several tag/class combinations; they are tried in
    the order listed and the first match wins.
    """
    selectors = (
        ('h4', 'graf graf--h4 graf-after--h3 graf--subtitle'),
        ('h4', 'graf graf--h4 graf-after--h3 graf--trailing graf--subtitle'),
        ('strong', 'markup--strong markup--p-strong'),
        ('h4', 'graf graf--p graf-after--h3 graf--trailing'),
        ('p', 'graf graf--p graf-after--h3 graf--trailing'),
        ('blockquote', 'graf graf--pullquote graf-after--figure graf--trailing'),
        ('p', 'graf graf--p graf-after--figure'),
        ('blockquote', 'graf graf--blockquote graf-after--h3 graf--trailing'),
        ('p', 'graf graf--p graf-after--figure graf--trailing'),
        ('em', 'markup--em markup--p-em'),
        ('p', 'graf graf--p graf-after--p graf--trailing'),
    )
    # Lazy generator: lookups stop at the first selector that matches
    candidates = (card.find(tag_name, class_=css) for tag_name, css in selectors)
    match = next((node for node in candidates if node is not None), None)
    if match is None:
        return None
    return match.text

def get_author(card):
    """Return the text of the card's author link, or None when the link is absent."""
    link = card.find(
        'a',
        class_='ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken',
    )
    return None if link is None else link.text

def get_publication(card):
    """Return the text of the card's publication link, or None when the link is absent."""
    link = card.find(
        'a',
        class_='ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal',
    )
    return None if link is None else link.text

def get_read_time(card):
    """Return the reading time from the card's ``readingTime`` span, or None.

    The span's ``title`` attribute is returned as a string with the
    ' min read' suffix removed.
    """
    reading_span = card.find('span', class_='readingTime')
    if reading_span is None:
        return None
    label = reading_span['title']
    return label.replace(' min read', '')

def get_claps(card):
    """Return the card's clap count as an int, or None if it cannot be read.

    The count is the text of the clap button. Values containing ``'K'`` (such
    as ``'1.2K'``) are multiplied by 1000. ``None`` is returned when the button
    is missing or its text is not a number.
    """
    button = card.find('button', class_='button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents')
    if button is None:
        return None
    text = button.text.strip()
    try:
        if 'K' in text:
            # round() before int() so a float product that lands just below a
            # whole number is not truncated to the value beneath it
            return int(round(float(text.replace('K', '')) * 1000))
        return int(text)
    except ValueError:
        return None

def is_comment(card):
    """Return True when the card contains the comment box div, i.e. it is a comment rather than a story."""
    comment_box = card.find('div', class_='u-fontSize14 u-marginTop10 u-marginBottom20 u-padding14 u-xs-padding12 u-borderRadius3 u-borderCardBackground u-borderLighterHover u-boxShadow1px4pxCardBorder')
    if comment_box is None:
        return False
    return True

def get_url(card):
    """Return the href of the card's first class-less link without its query string, or None.

    NOTE: this reuses the name of the ``get_url`` defined in the YouTube
    section; after this cell runs, that earlier definition is replaced.
    """
    anchor = card.find('a', class_='')
    if anchor is None:
        return None
    # Everything from the first '?' onwards is dropped
    href = anchor['href']
    return href.partition('?')[0]

def get_skills(title, subtitle):
    """Extract skills from a story's title and (optional) subtitle.

    The text passed to ``extract_skills`` is the title, followed by a space and
    the subtitle when one is given. Returns ``(skills, data_skills)`` as
    '; '-joined strings; ``data_skills`` is ``None`` when
    ``extract_data_skills`` yields nothing, and both are ``None`` when no
    skills are kept.

    NOTE: this reuses the name of the ``get_skills`` defined in the YouTube
    section; after this cell runs, that earlier definition is replaced.
    """
    context = title if subtitle is None else title + ' ' + subtitle
    all_skills = extract_skills(context)
    keep_skills, _ = extract_ignore(all_skills)
    keep_skills.sort()
    if len(keep_skills) == 0:
        return None, None
    data_skills = extract_data_skills(keep_skills)
    joined_data_skills = '; '.join(data_skills) if len(data_skills) > 0 else None
    return '; '.join(keep_skills), joined_data_skills

In [None]:
tag = 'data-science'
start_date = dt.datetime(2021, 9, 30)
end_date = dt.datetime(2021, 10, 18)
current_date = start_date

# Scrape one day at a time (end_date itself is excluded). The first day
# creates the CSV with a header row; later days are appended without one.
for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    is_first_day = i == 0
    df.to_csv('database/medium_data_science.csv', index=False,
              mode='w' if is_first_day else 'a', header=is_first_day)
    current_date += dt.timedelta(days=1)
    time.sleep(3)

In [None]:
tag = 'machine-learning'
current_date = start_date

# Same day-by-day scrape as for the previous tag, reusing start_date and
# end_date from the earlier cell. The first day creates the CSV with a header
# row; later days are appended without one.
for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    is_first_day = i == 0
    df.to_csv('database/medium_machine_learning.csv', index=False,
              mode='w' if is_first_day else 'a', header=is_first_day)
    current_date += dt.timedelta(days=1)
    time.sleep(3)

In [None]:
tag = 'data-engineering'
current_date = start_date

# Same day-by-day scrape as for the previous tags, reusing start_date and
# end_date from the earlier cell. The first day creates the CSV with a header
# row; later days are appended without one.
for i in range((end_date - start_date).days):
    df = medium_scraper(tag, current_date)
    is_first_day = i == 0
    df.to_csv('database/medium_data_engineering.csv', index=False,
              mode='w' if is_first_day else 'a', header=is_first_day)
    current_date += dt.timedelta(days=1)
    time.sleep(3)

In [None]:
# Combine the per-tag CSVs into one frame. pd.concat replaces the chained
# DataFrame.append calls (append was removed in pandas 2.0).
medium_csv_paths = [
    'database/medium_data_science.csv',
    'database/medium_machine_learning.csv',
    'database/medium_data_engineering.csv',
]
df_med = pd.concat([pd.read_csv(path) for path in medium_csv_paths])
# A story can carry several tags; keep its first occurrence only
df_med = df_med.drop_duplicates(subset=['id'])
# published_date is written from datetime values, i.e. year-first, so it is
# parsed without dayfirst=True.
# NOTE(review): confirm the CSVs hold no day-first dates from another source.
df_med['published_date'] = pd.to_datetime(df_med['published_date'])
df_med = df_med.sort_values(by=['published_date', 'id'])
# Drop any query string from the URLs; missing URLs (NaN) are left untouched
df_med['url'] = df_med['url'].map(lambda x: x.split('?')[0] if isinstance(x, str) else x)
df_med = df_med.reset_index(drop=True)
df_med.head()

In [None]:
# Save the combined, de-duplicated Medium stories to a single CSV.
df_med.to_csv('database/medium.csv', index=False)

In [3]:
# Date window for the next scrape; the loop prints every day that will be
# requested (end_date itself is excluded).
start_date = dt.datetime(2022, 3, 26)
end_date = dt.datetime(2022, 4, 13)
current_date = start_date
n_days = (end_date - start_date).days
for i in range(n_days):
    print(current_date)
    current_date += dt.timedelta(days=1)

2022-03-26 00:00:00
2022-03-27 00:00:00
2022-03-28 00:00:00
2022-03-29 00:00:00
2022-03-30 00:00:00
2022-03-31 00:00:00
2022-04-01 00:00:00
2022-04-02 00:00:00
2022-04-03 00:00:00
2022-04-04 00:00:00
2022-04-05 00:00:00
2022-04-06 00:00:00
2022-04-07 00:00:00
2022-04-08 00:00:00
2022-04-09 00:00:00
2022-04-10 00:00:00
2022-04-11 00:00:00
2022-04-12 00:00:00


In [4]:
tags = ['data-science', 'machine-learning', 'data-engineering']
# start_date / end_date come from the date-window cell above
# (an earlier run used 2021-12-14 to 2021-12-29).

# Collect one frame per (tag, day) and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
medium_frames = []

for tag in tags:
    current_date = start_date
    print(tag)
    for i in range((end_date - start_date).days):
        df_temp = medium_scraper(tag, current_date)
        medium_frames.append(df_temp)
        current_date = current_date + dt.timedelta(days=1)
        # Random pause between requests
        time.sleep(random.randint(1, 3))

# Per-frame indices are kept, as with the previous append-based loop
df_med = pd.concat(medium_frames) if medium_frames else pd.DataFrame()

df_med.head()

data-science
machine-learning
data-engineering


Unnamed: 0,id,title,subtitle,author,publication,published_date,read_time_mins,claps,url,skills,data_skills
0,2995910e564,Validate Your pandas DataFrame with Pandera,Make Sure Your Data Matches Your Expectation,Khuyen Tran,Towards Data Science,2022-03-26,5,454,https://towardsdatascience.com/validate-your-p...,Dataframe; Pandas,
1,fc8867d9b774,How To Really Understand The Raven Paradox?,A paradox that rocks the foundations of scient...,Hemanth,Street Science,2022-03-26,7,293,https://medium.com/street-science/how-to-reall...,Rally,
2,6b631334ce19,A Rant On Why I Despise Jupyter Notebooks,I remember one day messaging one of my seniors...,Agrover112,CodeX,2022-03-26,4,146,https://medium.com/codex/an-honest-rant-on-why...,Jupyter Notebook,
3,38124b21309a,Başarılı olmaktan korkma! Veri Bilimci olma yo...,,Serdar Tafralı,,2022-03-26,5,109,https://medium.com/@serdartafrali/ba%C5%9Far%C...,,
4,384f49b60d16,Plotly Callbacks: Create Exciting Interactive ...,Get started with callbacks in Plotly Dash,Anmol Tomar,Towards Data Science,2022-03-26,5,145,https://towardsdatascience.com/are-you-still-c...,Dash; Plotly,


In [5]:
# Order by date then id, and keep the first row for each story id
# (the same story can be returned for more than one tag).
df_med = (
    df_med
    .sort_values(by=['published_date', 'id'])
    .drop_duplicates(subset=['id'])
)
df_med

Unnamed: 0,id,title,subtitle,author,publication,published_date,read_time_mins,claps,url,skills,data_skills
0,2995910e564,Validate Your pandas DataFrame with Pandera,Make Sure Your Data Matches Your Expectation,Khuyen Tran,Towards Data Science,2022-03-26,5,454,https://towardsdatascience.com/validate-your-p...,Dataframe; Pandas,
0,2d1e9039f376,Transformer models: an introduction and catalo...,Update 04/02/2022,Xavier Amatriain,,2022-03-26,17,282,https://medium.com/@xamat/transformers-models-...,Transformer,
3,38124b21309a,Başarılı olmaktan korkma! Veri Bilimci olma yo...,,Serdar Tafralı,,2022-03-26,5,109,https://medium.com/@serdartafrali/ba%C5%9Far%C...,,
4,384f49b60d16,Plotly Callbacks: Create Exciting Interactive ...,Get started with callbacks in Plotly Dash,Anmol Tomar,Towards Data Science,2022-03-26,5,145,https://towardsdatascience.com/are-you-still-c...,Dash; Plotly,
6,5bf3513e7743,The Most Effective Data Scientists Are Also Et...,,Rebeca Ansar,An Amygdala,2022-03-26,2,173,https://medium.com/an-amygdala/the-most-effect...,,
...,...,...,...,...,...,...,...,...,...,...,...
11,ab8a784aa2d5,Veri Bilimi ile Tavsiye Sistemleri,Tavsiye sistemleri kullanıcılara bazı teknikle...,Serdar Tafralı,,2022-04-12,6,131,https://medium.com/@serdartafrali/veri-bilimi-...,C; R,R programming
4,b33921b4f17f,MLOps or How to Deploy Data Science at Scale,Extending AI & ML in the industry,Thibaud Lamothe 🤠,Towards Data Science,2022-04-12,11,138,https://towardsdatascience.com/mlops-or-how-to...,Artificial Intelligence (AI); Data Science; MLOps,AI
10,d49d2590ff9b,Upserting Pandas Dataframes to Snowflake,Automate your pipelines once and for all.,Chloé Lagrue,Better Programming,2022-04-12,4,105,https://betterprogramming.pub/upserting-pandas...,Dataframe; Pandas,
1,d6dbb155673c,Beginner’s Guide to Machine Learning with Big ...,A tutorial for working with large datasets using…,Nathaniel DiRenzo,Towards Data Science,2022-04-12,14,151,https://towardsdatascience.com/beginners-guide...,Big Data; Dataset; Machine Learning,


In [6]:
engine = create_engine(settings['skills_db2'])
# To rebuild the table from the saved CSV instead: load 'database/medium.csv'
# into df_med and use if_exists='replace' below.
df_med['published_date'] = pd.to_datetime(df_med['published_date'])
# NOTE(review): if_exists='append' adds these rows again every time the cell
# is re-run; confirm the table is de-duplicated elsewhere.
try:
    df_med.to_sql('ContentMedium', engine, index=False, if_exists='append')
finally:
    # Dispose of the engine even if the write fails
    engine.dispose()

## KDnuggets

In [None]:
# Load the KDnuggets articles and parse the 'date' column as datetimes
df_kd = (
    pd.read_csv('database/kdnuggets.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df_kd.head()

In [None]:
# Show column dtypes and non-null counts before writing to the database.
df_kd.info()

In [None]:
engine = create_engine(settings['skills_db2'])
# Replace the ContentKDnuggets table with df_kd; dispose of the engine even if
# the write fails so its connections are not left open.
try:
    df_kd.to_sql('ContentKDnuggets', engine, index=False, if_exists='replace')
finally:
    engine.dispose()

In [None]:
# Scratch check: the timestamp exactly seven days before now.
dt.datetime.now() - dt.timedelta(days=7)

In [None]:
# NOTE(review): same expression as the previous cell — likely a leftover scratch cell.
dt.datetime.now() - dt.timedelta(days=7)