In [3]:
!which python3

/opt/miniconda3/envs/jupyter/bin/python3


In [None]:
import datetime as dt
import dotenv
import json
import os
import re
import requests

from bs4 import BeautifulSoup
from dataclasses import dataclass, field
from functools import partial
from itertools import accumulate, islice, repeat, takewhile

In [None]:
# Load variables from a .env file into the process environment;
# create_session() later reads FB_EMAIL and FB_PASSWORD from os.environ.
dotenv.load_dotenv()

In [None]:
# Link texts the scraper searches for in the fetched markup. They are Polish
# UI strings, so presumably the logged-in account must use the Polish
# locale — TODO confirm.
NEXT_FEED_TEXT = 'Zobacz więcej relacji'  # looked up by get_next_posts_url
POST_URL_TEXT = 'Pełne zdarzenie'  # looked up by get_url

# Pieces of the base address; create_url() prefixes every path with them.
proto = 'https'
base_uri = 'mbasic.facebook.com'
home_url = f'{proto}://{base_uri}'

# Fixed UTC+2 offset that get_timestamp() converts publish times into.
# NOTE(review): a fixed offset does not follow daylight-saving changes —
# confirm this is intended.
tz = dt.timezone(dt.timedelta(hours=2))

In [None]:
# iter utils
def iterate(f, x):
    """Lazily yield x, f(x), f(f(x)), ... without end.

    ``f`` is only called when the next element is requested.
    """
    current = x
    while True:
        yield current
        current = f(current)


def take_nth(n, x):
    """Return the element at 0-based position ``n`` of iterable ``x``.

    Works on any iterable without materialising it. If ``x`` has fewer than
    ``n + 1`` elements, ``StopIteration`` is raised (not ``IndexError``).
    """
    tail = islice(x, n, None)  # skip the first n items, then take one
    return next(tail)

In [None]:
# Scraper

def create_url(path):
    """Build an absolute URL on the configured host from a site-relative path.

    Leading and trailing slashes of ``path`` are removed before joining.
    """
    relative = path.strip('/')
    return proto + '://' + base_uri + '/' + relative


def _get_login_data():
    """Fetch the home page and collect what is needed to submit its login form.

    Returns:
        tuple: ``(action_url, data, cookies)`` where ``action_url`` is the
        ``action`` attribute of the element with id ``login_form``, ``data``
        maps the names of its hidden/submit inputs to their values, and
        ``cookies`` are the cookies set by the home page response.

    Raises:
        requests.HTTPError: If the home page answers with an error status.
        RuntimeError: If the page contains no element with id ``login_form``.
    """
    res = requests.get(home_url)
    # Fail on an error page here instead of on a confusing parse error below.
    res.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # consider pinning one so results do not depend on the environment.
    soup = BeautifulSoup(res.text)
    form = soup.find(attrs={'id': 'login_form'})
    if form is None:
        raise RuntimeError('Login form not found on the home page.')
    inputs = form.find_all('input', attrs={'type': ['hidden', 'submit']})
    # Inputs without a name cannot be submitted; skipping them also keeps a
    # ``None`` key out of the POST data.
    data = {el.get('name'): el.get('value') for el in inputs if el.get('name')}
    return form.get('action'), data, res.cookies


def create_session():
    """Log in with the credentials from the environment and return the session.

    The login form data is fetched first, then ``FB_EMAIL`` and
    ``FB_PASSWORD`` are added to it and the form is POSTed through a new
    ``requests.Session``.

    Returns:
        requests.Session: The session the login request was sent through.

    Raises:
        KeyError: If ``FB_EMAIL`` or ``FB_PASSWORD`` is not set.
        RuntimeError: If the login POST is not answered with status 302.
    """
    login_url, login_data, login_cookies = _get_login_data()
    login_data['email'] = os.environ['FB_EMAIL']
    login_data['pass'] = os.environ['FB_PASSWORD']
    s = requests.Session()
    try:
        # Redirects are not followed so that the status of the login response
        # itself can be checked; a 302 is treated as a successful login.
        res = s.post(create_url(login_url), data=login_data, cookies=login_cookies, allow_redirects=False)
        if res.status_code != 302:
            raise RuntimeError('Error while logging in.')
    except Exception:
        # The caller never receives this session, so release it here.
        s.close()
        raise
    return s


def fetch_html(s, url):
    """GET ``url`` through session ``s`` and return the body parsed as soup.

    An error status (as judged by ``raise_for_status``) raises instead of
    being parsed.
    """
    response = s.get(url)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup)


def get_nth_child(n, soup):
    """Return the direct child of ``soup`` at 0-based position ``n``."""
    direct_children = soup.children
    return take_nth(n, direct_children)


def get_first_child(soup):
    """Return the first direct child of ``soup``."""
    return get_nth_child(0, soup)

In [None]:
# Timeline

def get_posts_as_soups(soup):
    """Return the children of the first element inside the ``feed`` element.

    ``soup`` is searched for an element with class ``feed``; the children of
    that element's first descendant tag are returned as-is.
    """
    feed = soup.find(attrs={'class': 'feed'})
    wrapper = feed.find()
    return wrapper.children


def get_next_posts_url(soup):
    """Return the absolute URL of the next batch of posts.

    The link is located through its text (``NEXT_FEED_TEXT``); the ``href``
    is read from the element two levels above that text node.
    """
    label = soup.find(string=NEXT_FEED_TEXT)
    link = label.find_parent().find_parent()
    href = link.get('href')
    return create_url(href)

In [None]:
from dataclasses import field

In [None]:
# Post

@dataclass
class Post:
    """One timeline post with the fields extracted from its feed element."""

    # Publication time (filled from get_timestamp by create_post_from_soup).
    timestamp: dt.datetime
    # Post text (filled from get_content).
    content: str
    # Counters (filled from get_likes / get_comments).
    likes: int
    comments: int
    # Link to the post (filled from get_url); excluded from the generated repr().
    url: str = field(repr=False)


def create_post_from_soup(post):
    """Build a ``Post`` from one feed element, or return ``None`` on failure.

    Parsing is best-effort: if any of the field extractors raises, the
    element is treated as "not a parsable post" and ``None`` is returned so
    the caller can skip it.

    Args:
        post: The feed element to extract the fields from.

    Returns:
        Post | None: The parsed post, or ``None`` if extraction failed.
    """
    try:
        return Post(
            timestamp=get_timestamp(post),
            content=get_content(post),
            likes=get_likes(post),
            comments=get_comments(post),
            url=get_url(post)
        )
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long scraping run.
        return None


def get_timestamp(post):
    """Return the post's publish time as an aware datetime in ``tz``.

    The time is read from the JSON in the element's ``data-ft`` attribute:
    first value of ``page_insights`` -> ``post_context`` -> ``publish_time``.
    """
    data_ft = json.loads(post.get('data-ft'))
    insights = list(data_ft['page_insights'].values())
    publish_time = insights[0]['post_context']['publish_time']
    # fromtimestamp() without tz yields a naive local time; astimezone()
    # interprets it in the system zone and converts it to ``tz``.
    local_naive = dt.datetime.fromtimestamp(publish_time)
    return local_naive.astimezone(tz)


def get_content(post):
    """Return the text of the first ``<p>`` in ``post``.

    The paragraph's stripped text fragments are joined with single spaces.
    """
    fragments = post.find('p').stripped_strings
    return ' '.join(fragments)


def get_likes(post):
    """Return the like count of ``post`` as an int.

    The count is the text of the first ``<a>`` inside the post's second
    child.
    """
    children = list(post.children)
    footer = children[1]
    likes_text = footer.a.text
    return int(likes_text)


def get_comments(post):
    """Return the comment count of ``post`` as an int.

    The comments label is the third child of the second child of the post's
    second child. The number is taken from the first word of that label if
    it is numeric, otherwise from the last word; if neither is numeric the
    count is 0.
    """
    footer = list(post.children)[1]
    stats = list(footer.children)[1]
    label = list(stats.children)[2]
    words = label.text.split()
    # The number may come before or after the label text, so try both ends.
    for candidate in (words[0], words[-1]):
        if candidate.isnumeric():
            return int(candidate)
    return 0


def get_url(post):
    """Return the ``href`` of the element wrapping the ``POST_URL_TEXT`` text."""
    label = post.find(string=POST_URL_TEXT)
    link = label.find_parent()
    return link.get('href')

In [None]:
# Log in once; this session is reused for every fetch in the cells below.
s = create_session()

In [None]:
# Page to scrape and the URL of the first page of its timeline.
page_id = 'ekstraklasatrolls'
url = create_url(f'{page_id}/?v=timeline')
# Fetch that first page once for manual inspection (dumped to disk below).
soup = fetch_html(s, url)

In [None]:
# Dump the fetched markup to disk for manual inspection. The encoding is
# pinned because the page contains non-ASCII text and the platform default
# encoding may not be able to represent it.
with open('index.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))

In [None]:
# Feed stream

def fetch_feed(s, url):
    """Yield the posts of a timeline, starting at ``url``.

    The first page is parsed with ``get_posts_as_soups``; every following
    page is delegated to ``fetch_feed_stream``. Elements that
    ``create_post_from_soup`` cannot parse are yielded as ``None``.

    Args:
        s: Session used for all requests.
        url: URL of the first timeline page.

    Yields:
        Post | None: One item per feed element, in page order.
    """
    soup = fetch_html(s, url)
    # Posts are built one at a time so a consumer that stops early does not
    # pay for parsing the rest of the page.
    for post_soup in get_posts_as_soups(soup):
        yield create_post_from_soup(post_soup)

    # Without a "more" link there is nothing left to page through; end the
    # stream instead of failing on the missing link.
    if soup.find(string=NEXT_FEED_TEXT) is None:
        return
    yield from fetch_feed_stream(s, get_next_posts_url(soup))


def fetch_feed_stream(s, url):
    """Yield the posts of the follow-up feed pages, starting at ``url``.

    Pages are fetched one after another by following the "more" link until a
    page has none. Elements that ``create_post_from_soup`` cannot parse are
    yielded as ``None``.

    Args:
        s: Session used for all requests.
        url: URL of the first follow-up page.

    Yields:
        Post | None: One item per feed element, in page order.
    """
    # A loop instead of self-recursion: nesting does not grow with the number
    # of pages, so long feeds cannot hit the recursion limit.
    while True:
        soup = fetch_html(s, url)
        # The posts container is reached by taking the first child five
        # times, starting from the second <table> of the page.
        container = take_nth(5, iterate(get_first_child, soup.find_all('table')[1]))
        for post_soup in container.children:
            yield create_post_from_soup(post_soup)

        # The last page has no "more" link; end the stream cleanly there.
        if soup.find(string=NEXT_FEED_TEXT) is None:
            return
        url = get_next_posts_url(soup)

In [None]:
# fetch_feed bound to the logged-in session, so only a URL has to be passed.
my_fetch_feed = partial(fetch_feed, s)

In [None]:
# Lazily consume the feed until the first parsed post dated on or before
# date_stop. Unparsed items (None) never end the stream.
# NOTE(review): this presumably relies on the feed being ordered newest
# first — confirm.
date_stop = dt.date(2023, 7, 20)
posts = takewhile(lambda p: not p or p.timestamp.date() > date_stop, my_fetch_feed(url))

In [None]:
# Pull the first item; the stream is lazy, so this is what triggers the first
# request. The result may be None if that element could not be parsed.
next(posts)