In [2]:
import requests
from bs4 import BeautifulSoup
import os
import datetime
from datetime import timedelta, datetime

import json
from requests_html import HTMLSession
import pickle

from collections import defaultdict
import logging

from medium_scraper.utils import check_archive
from dotenv import load_dotenv

import pandas as pd


class ProfileScraper:
    """Scrape the posts listed on a Medium author's profile.

    The scraper loads the profile page, reads the user id from the page's
    ``window.__PRELOADED_STATE__`` script, then pages through the profile's
    ``/_/graphql`` endpoint and flattens every returned post into a plain
    dictionary (see :meth:`extract_data` for the keys).
    """

    # Base URL that image ids are appended to in order to build image links.
    MIRO_BASE_URL = "https://miro.medium.com/"
    # Text that precedes the JSON state blob inside the profile page's script tag.
    PRELOADED_STATE_MARKER = "window.__PRELOADED_STATE__ = "

    def __init__(self):
        """Open an HTML session and load the pickled GraphQL request payload."""
        self.sess = HTMLSession()

        # NOTE(security): pickle.load can execute arbitrary code while loading.
        # Only use a "payload" file that you created yourself.
        with open("payload", "rb") as f:
            payload = pickle.load(f)

        self.payload = payload

        self.data = []

    def extract(self, url):
        """Scrape every post reachable from the profile at ``url``.

        Args:
            url: Profile URL, e.g. ``https://<name>.medium.com/``.

        Returns:
            list[dict]: One entry per post, as produced by :meth:`extract_data`.

        Raises:
            ValueError: If the page contains no preloaded-state script.
        """
        self.url = url
        self.url_response = self.sess.get(url)

        # Keep the last matching script tag (same choice the plain loop made),
        # but fail with a clear message instead of an unbound local.
        script = None
        for s in self.url_response.html.find("script"):
            if self.PRELOADED_STATE_MARKER in s.text:
                script = s
        if script is None:
            raise ValueError(
                f"Could not find window.__PRELOADED_STATE__ in the page at {url!r}"
            )

        self.preload_data = json.loads(script.text.split(self.PRELOADED_STATE_MARKER)[1])
        self.userid = self.preload_data['client']['routingEntity']['id']

        # Epoch milliseconds for "8 days ago", used to seed the first page request.
        t = round((datetime.today() - timedelta(days=8)).timestamp()) * 1000

        # NOTE(review): the value is sent with a literal "L" prefix; confirm the
        # endpoint expects that form.
        self.variables = {"homepagePostsFrom":f"L{t}", # the time
                        "includeDistributedResponses":True,
                        "id":self.userid,
                        "username":None,
                        "homepagePostsLimit":25}

        return self.scrape()

    def _send_request(self, timestamp = None):
        """Request one page of posts from the profile's GraphQL endpoint.

        Args:
            timestamp: Optional paging cursor; when given it replaces
                ``homepagePostsFrom`` in the request variables.

        Returns:
            tuple[list, dict]: ``(posts, paging_info)``. Both are empty when the
            response does not contain a posts connection.

        Raises:
            Exception: If the response body is not valid JSON.
        """
        if timestamp:
            self.variables['homepagePostsFrom'] = timestamp

        self.payload[0]['variables'] = self.variables

        # Build the endpoint by string concatenation: os.path.join would use the
        # platform path separator, which is wrong for URLs on Windows.
        r = self.sess.post(f"{self.url.rstrip('/')}/_/graphql", json=self.payload)

        # Decoding has to happen inside the try block, otherwise the handler
        # below can never run. JSON decode errors are ValueError subclasses.
        try:
            rjson = r.json()[0]
        except ValueError as err:
            print(r.headers)
            print(r.text)
            raise Exception("An error occurred while decoding the response JSON.") from err

        # Walk data -> userResult -> homepagePostsConnection, tolerating a
        # missing or null level instead of raising AttributeError.
        connection = rjson
        for key in ("data", "userResult", "homepagePostsConnection"):
            connection = (connection or {}).get(key)

        if not connection:
            logging.warning("No homepagePostsConnection in the response for %s", self.url)
            return [], {}

        return connection.get('posts') or [], connection.get('pagingInfo') or {}

    def scrape(self):
        """Follow the paging cursors and collect the data of every post.

        Returns:
            list[dict]: Entries from :meth:`extract_data` for all pages, in the
            order the pages were returned.
        """
        data = []
        posts, paging_info = self._send_request()

        while posts:
            data.extend(self.extract_data(posts))

            next_info = (paging_info or {}).get("next")
            if not next_info:
                break

            posts, paging_info = self._send_request(next_info['from'])

        return data

    @staticmethod
    def _millis_to_str(millis):
        """Format epoch milliseconds as a local-time datetime string; falsy values pass through."""
        if millis:
            return str(datetime.fromtimestamp(millis/1000))
        return millis

    @classmethod
    def _image_url(cls, image_id):
        """Return the image URL for ``image_id``, or None when there is no id."""
        if not image_id:
            return None
        return cls.MIRO_BASE_URL + image_id.lstrip("/")

    def extract_data(self, posts):
        """Flatten raw post records into dictionaries of the fields of interest.

        Every field is resolved afresh for each post, so a post that lacks a
        creator, avatar, newsletter, etc. gets ``None`` for those fields instead
        of raising ``NameError`` or inheriting the previous post's values.

        Args:
            posts: Iterable of post dictionaries from the GraphQL response.

        Returns:
            list[dict]: One dictionary per post with the keys ``author``,
            ``author_avatar_url``, ``membership_date``, ``author_bio``,
            ``num_followers``, ``v3_newsletter_subs``, ``date``,
            ``reading_time``, ``title``, ``post_url``, ``claps``,
            ``num_unique_clappers``, ``image_url``, ``num_responses``, ``tags``.
        """
        data = []
        for post in posts:
            # An absent creator is treated as an empty record: all author
            # fields then resolve to None.
            creator_data = post.get("creator") or {}

            social_stats = creator_data.get("socialStats")
            num_followers = social_stats.get("followerCount") if social_stats else None

            # number of subscribers registered in the newsletter v3
            v3_newsletter = creator_data.get("newsletterV3")
            v3_newsletter_subs = v3_newsletter.get("subscribersCount") if v3_newsletter else None

            # Drop the fractional part while keeping the original numeric type
            # (4.5 -> 4.0), as the existing dataset stores it.
            reading_time = post.get("readingTime")
            if reading_time:
                reading_time = reading_time - (reading_time % 1)

            preview_image = post.get("previewImage")
            image_url = self._image_url(preview_image.get("id")) if preview_image else None

            post_responses = post.get("postResponses")
            num_responses = post_responses.get("count") if post_responses else None

            # Keep only the tag ids; a missing/empty tag list is passed through.
            tags = post.get("tags")
            if tags:
                tags = [tag['id'] for tag in tags]

            data.append({
                "author": creator_data.get("name"),
                "author_avatar_url": self._image_url(creator_data.get("imageId")),
                "membership_date": self._millis_to_str(creator_data.get("mediumMemberAt")),
                "author_bio": creator_data.get("bio"),
                "num_followers": num_followers,
                "v3_newsletter_subs": v3_newsletter_subs,
                "date": self._millis_to_str(post.get("latestPublishedAt")),
                "reading_time": reading_time,
                "title": post.get("title"),
                "post_url": post.get("mediumUrl"),
                "claps": post.get("clapCount"),
                "num_unique_clappers": post.get("voterCount"),
                "image_url": image_url,
                "num_responses": num_responses,
                "tags": tags,
            })

        return data

In [4]:
# Shared scraper instance used by extract(); constructing it opens an HTML
# session and reads the pickled "payload" file from the working directory.
scraper = ProfileScraper()

def extract(urls, profile_scraper=None):
    """Scrape every author profile in ``urls`` and return one flat list of posts.

    Each returned post dictionary gets an extra ``author_url`` key holding the
    profile URL it was scraped from. Blank URLs are skipped, and a URL that
    appears more than once is scraped only once (its posts appear at the
    position of its first occurrence).

    Args:
        urls: Iterable of profile URLs.
        profile_scraper: Object with an ``extract(url)`` method returning a
            list of post dictionaries. Defaults to the notebook-level
            ``scraper`` instance.

    Returns:
        list[dict]: All scraped posts, grouped by author in input order.
    """
    active_scraper = scraper if profile_scraper is None else profile_scraper

    data = []
    seen = set()
    for url in urls:
        # Skip empty entries and repeats rather than issuing wasted requests.
        if not url or url in seen:
            continue
        seen.add(url)

        print("Scraping url:", url)
        for post in active_scraper.extract(url):
            post['author_url'] = url
            data.append(post)

    return data


# Load environment variables (such as DATASET_PATH) from a local .env file.
load_dotenv()
DATASET_PATH = os.getenv("DATASET_PATH")

# authors.txt holds one profile URL per line. Surrounding whitespace is removed
# and blank lines are dropped so that no empty URL is handed to the scraper.
with open("authors.txt", "r") as f:
    author_urls = [line.strip() for line in f if line.strip()]

dataset = extract(author_urls)

Scraping url: https://timdenning.medium.com/
Scraping url: https://hasanaboulhasan.medium.com/
Scraping url: https://cfhorgan.medium.com/
Scraping url: https://dessyperalt.medium.com/
Scraping url: https://anonwit.medium.com/


In [14]:
# Show the first scraped record. The scraped posts live in `dataset`; no
# notebook-level variable named `data` is defined (it is local to extract()).
dataset[0]

{'author': 'Tim Denning',
 'author_avatar_url': 'https://miro.medium.com/1*bfllCILGW4yHKXgFo8JkHg.jpeg',
 'membership_date': '2021-10-07 13:55:47',
 'author_bio': 'Aussie Blogger with 500M+ views — Writer for CNBC & Business Insider. Inspiring the world through Personal Development and Entrepreneurship — timdenning.com/mb',
 'num_followers': 312483,
 'v3_newsletter_subs': 3355,
 'date': '2023-02-27 22:07:03.532000',
 'reading_time': 4.0,
 'title': 'How I Feel Highly Motivated at Times I Should Feel Lazy as Hell (Without Drugs or Hacks)',
 'post_url': 'https://medium.com/mind-cafe/how-i-feel-highly-motivated-at-times-i-should-feel-lazy-as-hell-without-drugs-or-hacks-daf475830c8e',
 'claps': 761,
 'num_unique_clappers': 60,
 'image_url': 'https://miro.medium.com/1*XK7lrP-R03_mkPBXQstmlQ.jpeg',
 'num_responses': 13,
 'tags': ['motivation', 'self-improvement', 'life', 'productivity', 'society'],
 'author_url': 'https://timdenning.medium.com/'}

In [15]:
# Directory containing the dataset files, read from the environment
# (None when the variable is not set).
DATASET_PATH = os.environ.get("DATASET_PATH")

In [16]:
# The CSV's first column is the unnamed row index that was saved with the data;
# index_col=0 restores it as the index instead of loading it as a spurious
# "Unnamed: 0" data column.
authors = pd.read_csv(os.path.join(DATASET_PATH, "raw_dataset.csv"), index_col=0)
authors

Unnamed: 0,author,author_avatar_url,membership_date,author_bio,num_followers,v3_newsletter_subs,date,reading_time,title,post_url,claps,num_unique_clappers,image_url,num_responses,tags,author_url
0,Tim Denning,https://miro.medium.com/1*bfllCILGW4yHKXgFo8Jk...,2021-10-07 13:55:47,Aussie Blogger with 500M+ views — Writer for C...,312483,3355,2023-02-27 22:07:03.532000,4.0,How I Feel Highly Motivated at Times I Should ...,https://medium.com/mind-cafe/how-i-feel-highly...,761,60,https://miro.medium.com/1*XK7lrP-R03_mkPBXQstm...,13,"['motivation', 'self-improvement', 'life', 'pr...",https://timdenning.medium.com/
1,Tim Denning,https://miro.medium.com/1*bfllCILGW4yHKXgFo8Jk...,2021-10-07 13:55:47,Aussie Blogger with 500M+ views — Writer for C...,312483,3355,2023-02-27 16:31:24.188000,6.0,Money Habits Keeping You Poor — Ex-Banker Expl...,https://medium.com/swlh/money-habits-keeping-y...,1426,170,https://miro.medium.com/1*3lljDthLJJNeU1mMcjgy...,22,"['money', 'entrepreneurship', 'business', 'wor...",https://timdenning.medium.com/
2,Tim Denning,https://miro.medium.com/1*bfllCILGW4yHKXgFo8Jk...,2021-10-07 13:55:47,Aussie Blogger with 500M+ views — Writer for C...,312483,3355,2023-02-24 10:32:41.574000,4.0,Why Not Quit Your Job in 2023 (And Access Pers...,https://medium.com/swlh/why-not-quit-your-job-...,1573,170,https://miro.medium.com/1*h3yAJOg_s7zxOlZ-TuGh...,33,"['entrepreneurship', 'artificial-intelligence'...",https://timdenning.medium.com/
3,Tim Denning,https://miro.medium.com/1*bfllCILGW4yHKXgFo8Jk...,2021-10-07 13:55:47,Aussie Blogger with 500M+ views — Writer for C...,312483,3355,2023-02-23 00:31:21.655000,5.0,8 Hard Truths About Making Money Online That E...,https://medium.com/swlh/8-hard-truths-about-ma...,1205,82,https://miro.medium.com/1*-d6wDJvbNIqTFgJGSENv...,17,"['entrepreneurship', 'money', 'startup', 'advi...",https://timdenning.medium.com/
4,Tim Denning,https://miro.medium.com/1*bfllCILGW4yHKXgFo8Jk...,2021-10-07 13:55:47,Aussie Blogger with 500M+ views — Writer for C...,312483,3355,2023-02-21 21:48:26.784000,4.0,Here’s How to Get Unlikely Strangers to Respec...,https://medium.com/mind-cafe/heres-how-to-get-...,1073,68,https://miro.medium.com/1*tR1AgtveDv3oJ5EY1mO1...,14,"['relationships', 'life', 'self-improvement', ...",https://timdenning.medium.com/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,Thanos,https://miro.medium.com/1*EWgg9232zpLPBvd_UG6U...,2022-11-25 20:18:07,Soon to be MD. Here to make your life better o...,1748,85,2023-02-22 22:23:27.104000,3.0,You Can’t Argue With People.,https://medium.com/illumination/you-cant-argue...,36,6,https://miro.medium.com/1*zspMLeyjBOeduBYhRpOq...,8,"['debate', 'self-improvement', 'education', 'p...",https://anonwit.medium.com/
3190,Thanos,https://miro.medium.com/1*EWgg9232zpLPBvd_UG6U...,2022-11-25 20:18:07,Soon to be MD. Here to make your life better o...,1748,85,2023-01-22 16:22:51.562000,3.0,Here’s Why Your Comfort Zone Sucks.,https://medium.com/illumination/heres-why-your...,12,4,https://miro.medium.com/1*yzFyMkRWp3cVZ1DfT6MA...,0,"['self-improvement', 'challenge', 'growth', 'p...",https://anonwit.medium.com/
3191,Thanos,https://miro.medium.com/1*EWgg9232zpLPBvd_UG6U...,2022-11-25 20:18:07,Soon to be MD. Here to make your life better o...,1748,85,2023-01-28 17:39:19.840000,3.0,"I Quit Porn For 1 Year, Here’s What Happened.",https://medium.com/illumination/i-quit-porn-fo...,54,3,https://miro.medium.com/0*uQppI5t_yng68FnU,1,"['pornography', 'self-improvement', 'self-awar...",https://anonwit.medium.com/
3192,Thanos,https://miro.medium.com/1*EWgg9232zpLPBvd_UG6U...,2022-11-25 20:18:07,Soon to be MD. Here to make your life better o...,1748,85,2023-01-22 15:29:45.798000,2.0,Ask Yourself This Simple Question To Know If Y...,https://anonwit.medium.com/ask-yourself-this-s...,4,2,https://miro.medium.com/0*zAH9F1XnTwALXd3-,0,"['love', 'philosophy', 'self-improvement', 'th...",https://anonwit.medium.com/


In [19]:
# Count missing values per column of the loaded dataset. `df` is not defined in
# any visible cell, so the frame loaded from raw_dataset.csv is used; the saved
# index column is dropped if it was loaded as data.
authors.drop(columns="Unnamed: 0", errors="ignore").isnull().sum()

author                 0
author_avatar_url      0
membership_date        0
author_bio             0
num_followers          0
v3_newsletter_subs     0
date                   0
reading_time           0
title                  0
post_url               0
claps                  0
num_unique_clappers    0
image_url              0
num_responses          0
tags                   0
author_url             0
dtype: int64

In [None]:
# Average of the numeric post/author metrics for each author.
authors.groupby("author")[
    [
        "num_followers",
        "v3_newsletter_subs",
        "reading_time",
        "claps",
        "num_unique_clappers",
        "num_responses",
    ]
].mean()