In [69]:
import requests
import random
import pandas as pd
import re
import json
import datetime as dt
import time
from bs4 import BeautifulSoup
from headers import headers_list
from loading_bar import log_progress
from data_skills import SKILLS, SKILL_DICT
from skill_extraction import extract_skills, extract_ignore
from secrets import api_keys

In [56]:
# Tags of interest on Medium.
TAGS = ['data-science', 'machine-learning']

# Archive listing for a tag: the tag slug fills the placeholder and the
# caller appends a 'YYYY/MM/DD' path for the day it wants.
BASE_URL = "https://medium.com/tag/{}/archive/"

# One year (365.25 days) before the moment this cell is executed, so the
# value changes on every run.
START_DATE = dt.datetime.now() - dt.timedelta(days=365.25)
START_DATE

datetime.datetime(2020, 9, 29, 5, 35, 51, 809110)

In [61]:
def get_id(card):
    """Return the card's 'data-post-id' attribute, or None.

    The post container is located by its exact class string; None is
    returned when the card has no such <div>.
    """
    post_div = card.find('div', class_='postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls')
    return None if post_div is None else post_div['data-post-id']

def get_title(card):
    """Return the text of the card's title element, or None.

    Titles appear under several tag/class combinations, so each known
    combination is probed in order and the first hit wins.
    """
    title_selectors = (
        ('h3', 'graf graf--h3 graf-after--figure graf--title'),
        ('h3', 'graf graf--h3 graf-after--figure graf--trailing graf--title'),
        ('h4', 'graf graf--h4 graf--leading'),
        ('h3', 'graf graf--h3 graf--leading graf--title'),
        ('p', 'graf graf--p graf--leading'),
        ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf--leading graf--title'),
        ('h3', 'graf graf--h3 graf--startsWithDoubleQuote graf-after--figure graf--trailing graf--title'),
    )
    # Lazy generator: lookups stop as soon as one selector matches.
    candidates = (card.find(tag_name, class_=css) for tag_name, css in title_selectors)
    node = next((found for found in candidates if found is not None), None)
    if node is None:
        return None
    return node.text

def get_subtitle(card):
    """Return the text of the card's subtitle element, or None.

    Subtitles appear under several tag/class combinations, so each known
    combination is probed in order and the first hit wins.
    """
    subtitle_selectors = (
        ('h4', 'graf graf--h4 graf-after--h3 graf--subtitle'),
        ('h4', 'graf graf--h4 graf-after--h3 graf--trailing graf--subtitle'),
        ('strong', 'markup--strong markup--p-strong'),
        ('h4', 'graf graf--p graf-after--h3 graf--trailing'),
        ('p', 'graf graf--p graf-after--h3 graf--trailing'),
        ('blockquote', 'graf graf--pullquote graf-after--figure graf--trailing'),
        ('p', 'graf graf--p graf-after--figure'),
        ('blockquote', 'graf graf--blockquote graf-after--h3 graf--trailing'),
        ('p', 'graf graf--p graf-after--figure graf--trailing'),
        ('em', 'markup--em markup--p-em'),
        ('p', 'graf graf--p graf-after--p graf--trailing'),
    )
    # Lazy generator: lookups stop as soon as one selector matches.
    candidates = (card.find(tag_name, class_=css) for tag_name, css in subtitle_selectors)
    node = next((found for found in candidates if found is not None), None)
    if node is None:
        return None
    return node.text

def get_author(card):
    """Return the text of the card's author link, or None if it has none."""
    author_link = card.find('a', class_='ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken')
    if author_link is None:
        return None
    return author_link.text

def get_publication(card):
    """Return the text of the card's publication link, or None if it has none."""
    publication_link = card.find('a', class_='ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal')
    if publication_link is None:
        return None
    return publication_link.text

def get_read_time(card):
    """Return the card's reading time with ' min read' stripped, or None.

    The value is read from the 'title' attribute of the 'readingTime'
    span and is returned as a string.
    """
    reading_span = card.find('span', class_='readingTime')
    if reading_span is None:
        return None
    label = reading_span['title']
    return label.replace(' min read', '')

def get_claps(card):
    """Return the text of the card's clap-count button, or None if absent.

    The text is returned as rendered; no numeric conversion is done.
    """
    clap_button = card.find('button', class_='button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents')
    if clap_button is None:
        return None
    return clap_button.text

def is_comment(card):
    """Return True when the card is a comment rather than a story.

    A card counts as a comment when it contains the <div> with the
    class string looked up below.
    """
    return card.find('div', class_='u-fontSize14 u-marginTop10 u-marginBottom20 u-padding14 u-xs-padding12 u-borderRadius3 u-borderCardBackground u-borderLighterHover u-boxShadow1px4pxCardBorder') is not None

def get_url(card):
    """Return the 'href' of the card's <a> found with an empty class filter, or None."""
    anchor = card.find('a', class_='')
    return None if anchor is None else anchor['href']

def get_skills(title, subtitle):
    """Return the skills found in a post's title/subtitle as one string.

    The title (plus the subtitle, space-separated, when there is one) is
    passed to extract_skills; the result goes through extract_ignore and
    only its first return value is kept. Those entries are sorted and
    joined with '; '. Returns None when nothing is kept.
    """
    text = title if subtitle is None else title + ' ' + subtitle
    kept, _ignored = extract_ignore(extract_skills(text))
    kept.sort()
    return '; '.join(kept) if len(kept) > 0 else None

In [62]:
def get_posts(tag, date, timeout=30):
    """Scrape one day of a Medium tag archive into a DataFrame.

    Parameters
    ----------
    tag : str
        Tag slug substituted into BASE_URL.
    date : datetime.date or datetime.datetime
        Day to fetch; formatted as 'YYYY/MM/DD' and appended to the URL.
        It is also stored unchanged in the 'published_date' column.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a
        stalled connection cannot hang a long scrape.

    Returns
    -------
    pandas.DataFrame
        One row per story card. Cards without a recognised title and
        cards that are comments are skipped. The columns are always
        present and in a fixed order, even when no story is found, so
        the result can be written to CSV with a well-formed header.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of silently
        returning zero rows for that day.
    requests.Timeout
        If the request does not complete within `timeout` seconds.
    """
    columns = ['id', 'title', 'subtitle', 'author', 'publication',
               'published_date', 'read_time_mins', 'claps', 'url', 'skills']
    url = BASE_URL.format(tag) + date.strftime('%Y/%m/%d')
    page = requests.get(url, headers=random.choice(headers_list), timeout=timeout)
    page.raise_for_status()
    # NOTE(review): requests follows redirects by default; if the site
    # redirects a day URL to another listing, those cards would still be
    # stamped with `date` -- confirm whether page.url should be checked.
    soup = BeautifulSoup(page.content, 'html.parser')
    # Pulls each card from the feed. Each card is a story or comment
    cards = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')
    records = []
    for card in cards:
        title = get_title(card)
        if title is None or is_comment(card):
            continue
        # Only extract the subtitle for cards that are actually kept.
        subtitle = get_subtitle(card)
        records.append({
            'id': get_id(card),
            'title': title,
            'subtitle': subtitle,
            'author': get_author(card),
            'publication': get_publication(card),
            'published_date': date,
            'read_time_mins': get_read_time(card),
            'claps': get_claps(card),
            'url': get_url(card),
            'skills': get_skills(title, subtitle)
        })
    return pd.DataFrame(records, columns=columns)

In [71]:
tag = 'data-science'
start_date = dt.datetime(2021, 1, 2)
end_date = dt.datetime(2021, 9, 28)  # exclusive: the last day fetched is the day before
current_date = start_date

for i in log_progress(range((end_date - start_date).days)):
    df = get_posts(tag, current_date)
    if not df.empty:
        # Append so an earlier partial scrape is kept, but write the header
        # row when the file is new or empty. Without it, pd.read_csv would
        # consume the first data row as the column names.
        # newline='' lets the CSV writer control line endings itself.
        with open('results/medium_data_science.csv', 'a', newline='', encoding='utf-8') as csv_file:
            df.to_csv(csv_file, index=False, header=csv_file.tell() == 0)
    current_date = current_date + dt.timedelta(days=1)
    # Pause between requests to avoid hammering the site.
    time.sleep(3)

VBox(children=(HTML(value=''), IntProgress(value=0, max=269)))

In [73]:
tag = 'machine-learning'
start_date = dt.datetime(2021, 1, 2)
end_date = dt.datetime(2021, 9, 28)  # exclusive: the last day fetched is the day before
current_date = start_date

# The first non-empty frame overwrites the file and writes the header;
# every later frame is appended without one. Tying the header to the
# first *written* frame (not to i == 0) keeps the CSV well-formed even
# when the first day yields no posts.
header_written = False
for i in log_progress(range((end_date - start_date).days)):
    df = get_posts(tag, current_date)
    if not df.empty:
        df.to_csv('results/medium_machine_learning.csv', index=False,
                  mode='a' if header_written else 'w',
                  header=not header_written)
        header_written = True
    current_date = current_date + dt.timedelta(days=1)
    # Pause between requests to avoid hammering the site.
    time.sleep(3)

VBox(children=(HTML(value=''), IntProgress(value=0, max=269)))

In [79]:
# Load the per-tag scrape results back from disk.
df_ml = None  # placeholder so both names exist before the row counts are shown
df_ds = pd.read_csv('results/medium_data_science.csv')
df_ml = pd.read_csv('results/medium_machine_learning.csv')
# Row counts: (data-science, machine-learning).
tuple(map(len, (df_ds, df_ml)))

(29947, 28479)

In [87]:
# Stack both tag results into one frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent and likewise keeps
# each frame's original index labels. Duplicates are dropped in a later cell.
df = pd.concat([df_ds, df_ml])
len(df)

58426

In [89]:
# Distinct values per column, in the order (id, title, url).
tuple(len(df[column].unique()) for column in ('id', 'title', 'url'))

(47148, 45880, 57914)

In [111]:
# Keep the first row seen for each post id, order by date then id, and
# write the combined data set.
df = (
    df
    .drop_duplicates(subset=['id'])
    .sort_values(by=['published_date', 'id'])
)
df.to_csv('results/medium.csv', index=False)