Import Beautiful Soup and pandas to scrape Medium articles and their associated data from the web, then store the results in a DataFrame for machine learning in the next step.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os

Here are the URLs of archives for seven major publications. We will scrape these articles for the model.

In [2]:
# Archive URL template for each publication, keyed by publication name.
# Each template is filled with str.format(year, month, day); month and day
# are zero-padded to two digits by the `:02d` format spec.
urls = dict([
    ('Towards Data Science', 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}'),
    ('UX Collective', 'https://uxdesign.cc/archive/{0}/{1:02d}/{2:02d}'),
    ('The Startup', 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}'),
    ('The Writing Cooperative', 'https://writingcooperative.com/archive/{0}/{1:02d}/{2:02d}'),
    ('Data Driven Investor', 'https://medium.com/datadriveninvestor/archive/{0}/{1:02d}/{2:02d}'),
    ('Better Humans', 'https://medium.com/better-humans/archive/{0}/{1:02d}/{2:02d}'),
    ('Better Marketing', 'https://medium.com/better-marketing/archive/{0}/{1:02d}/{2:02d}'),
])

In [3]:
def is_leap(year):
    """Return True if ``year`` is a leap year, False otherwise.

    A year is a leap year when it is divisible by 4, except that years
    divisible by 100 must also be divisible by 400.
    """
    divisible_by_4 = year % 4 == 0
    divisible_by_100 = year % 100 == 0
    divisible_by_400 = year % 400 == 0
    return divisible_by_4 and (not divisible_by_100 or divisible_by_400)
    
def convert_day(day, year):
    """Convert a 1-based day-of-year into a ``(month, day_of_month)`` pair.

    Args:
        day: Ordinal day within ``year`` (1 is January 1st).
        year: Calendar year; decides whether February has 28 or 29 days.

    Returns:
        Tuple ``(month, day_of_month)``, both 1-based.

    Raises:
        ValueError: If ``day`` is smaller than 1 or larger than the number
            of days in ``year``.
    """
    month_days = [31, 29 if is_leap(year) else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    days_in_year = sum(month_days)
    # Reject out-of-range input up front instead of returning a bogus
    # (0, 0) or running past the end of month_days.
    if not 1 <= day <= days_in_year:
        raise ValueError(f'day must be between 1 and {days_in_year} for year {year}, got {day}')
    remaining = day
    for month, length in enumerate(month_days, start=1):
        if remaining <= length:
            return (month, remaining)
        remaining -= length

def get_claps(claps_str):
    """Parse a clap-count label such as ``'532'`` or ``'1.2K'`` into an int.

    Args:
        claps_str: Text of the clap counter. ``None``, the empty string, and
            objects whose ``split`` attribute is missing or ``None`` are
            treated as "no claps".

    Returns:
        The clap count as an ``int``. A ``K`` suffix multiplies the number
        by 1000.

    Raises:
        ValueError: If the text before the optional ``K`` is not a number.
    """
    # NOTE(review): the `split` check presumably filters out non-string nodes
    # (e.g. a bs4 Tag, whose unknown attributes resolve to None) - confirm.
    # getattr with a default also covers objects that have no `split` at all.
    if claps_str is None or claps_str == '' or getattr(claps_str, 'split', None) is None:
        return 0
    parts = claps_str.split('K')
    value = float(parts[0])
    if len(parts) == 2:
        # round() instead of int(): binary floats can land just below the
        # intended value (1.005 * 1000 == 1004.9999999999999), and truncating
        # would drop a clap.
        return round(value * 1000)
    return int(value)

Below, I choose the year and the days to analyze. Here I select every day in 2022.

In [4]:
# Year to scrape, and every day-of-year ordinal in it
# (1 through 365, or 1 through 366 when the year is a leap year).
year = 2022
selected_days = list(range(1, (366 if is_leap(year) else 365) + 1))

Below, I scrape all the articles from all the publications on each day I selected using BeautifulSoup.

In [5]:
# Scrape the archive page of every publication for every selected day and
# collect one record per article in `data`:
# [id, url, title, subtitle, claps, responses, reading_time, publication, date]
data = []
article_id = 0
n = len(selected_days)
for i, d in enumerate(selected_days, start=1):
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        try:
            # The timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(archive_url, allow_redirects=True, timeout=30)
        except requests.RequestException as exc:
            print(f'  skipping {archive_url}: {exc}')
            continue
        # Skip when the final URL (after redirects) is not under the requested
        # archive URL.
        # NOTE(review): presumably days without an archive page redirect to a
        # parent page - confirm.
        if not response.url.startswith(archive_url):
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            title = title.contents[0]
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.contents[0] if subtitle is not None else ''
            # Collected once and reused for both the article URL and the responses count.
            links = article.find_all("a")
            article_url = links[3]['href'].split('?')[0]
            try:
                claps = get_claps(article.find_all("button")[1].contents[0])
            except (IndexError, ValueError):
                # A missing or unparseable clap button counts as zero, so the
                # previous article's value is never carried over.
                claps = 0
            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
            # NOTE(review): relies on a card with exactly 7 links having the
            # responses link last - confirm against the current markup.
            responses = 0
            if len(links) == 7:
                try:
                    # Stored as an int so the column has a single type.
                    responses = int(links[6].contents[0].split(' ')[0])
                except (IndexError, TypeError, ValueError):
                    responses = 0

            data.append([article_id, article_url, title, subtitle, claps, responses, reading_time, publication, date])




1 / 365 ; 2022-01-01
2 / 365 ; 2022-01-02
3 / 365 ; 2022-01-03
4 / 365 ; 2022-01-04
5 / 365 ; 2022-01-05
6 / 365 ; 2022-01-06
7 / 365 ; 2022-01-07
8 / 365 ; 2022-01-08
9 / 365 ; 2022-01-09
10 / 365 ; 2022-01-10
11 / 365 ; 2022-01-11
12 / 365 ; 2022-01-12
13 / 365 ; 2022-01-13
14 / 365 ; 2022-01-14
15 / 365 ; 2022-01-15
16 / 365 ; 2022-01-16
17 / 365 ; 2022-01-17
18 / 365 ; 2022-01-18
19 / 365 ; 2022-01-19
20 / 365 ; 2022-01-20
21 / 365 ; 2022-01-21
22 / 365 ; 2022-01-22
23 / 365 ; 2022-01-23
24 / 365 ; 2022-01-24
25 / 365 ; 2022-01-25
26 / 365 ; 2022-01-26
27 / 365 ; 2022-01-27
28 / 365 ; 2022-01-28
29 / 365 ; 2022-01-29
30 / 365 ; 2022-01-30
31 / 365 ; 2022-01-31
32 / 365 ; 2022-02-01
33 / 365 ; 2022-02-02
34 / 365 ; 2022-02-03
35 / 365 ; 2022-02-04
36 / 365 ; 2022-02-05
37 / 365 ; 2022-02-06
38 / 365 ; 2022-02-07
39 / 365 ; 2022-02-08
40 / 365 ; 2022-02-09
41 / 365 ; 2022-02-10
42 / 365 ; 2022-02-11
43 / 365 ; 2022-02-12
44 / 365 ; 2022-02-13
45 / 365 ; 2022-02-14
46 / 365 ; 2022-02-

362 / 365 ; 2022-12-28
363 / 365 ; 2022-12-29
364 / 365 ; 2022-12-30
365 / 365 ; 2022-12-31


I then turned these data into a pandas dataframe capturing data on the title, subtitle, claps, etc. I will only be using titles and claps for my model.

In [6]:
# One row per scraped article; the column order matches the field order of
# each record appended to `data`.
medium_df = pd.DataFrame(
    data=data,
    columns=[
        'id',
        'url',
        'title',
        'subtitle',
        'claps',
        'responses',
        'reading_time',
        'publication',
        'date',
    ],
)

In [8]:
# Write the scraped table to CSV without the DataFrame index column.
medium_df.to_csv(path_or_buf='medium_data.csv', index=False)