### Initial setup

In [3]:
# Import the core libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
import time

In [4]:
# Root URL of the BBNaija 2024 live-updates thread on Nairaland.
BASE_URL = "https://www.nairaland.com/8156758/bbnaija-2024-live-updates-thread"

# Request headers sent with every fetch. A desktop-Chrome User-Agent is used
# so the requests look like ordinary browser traffic rather than a bot.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
}

### Helper function to parse a single page

In [None]:
def _extract_post(meta_row, content_row, page_num, url):
    """Build one post record from a metadata row and its content row.

    Args:
        meta_row: The ``<tr>`` expected to hold the post header cell
            (``td`` with class ``bold l pu``).
        content_row: The ``<tr>`` immediately following ``meta_row``,
            expected to hold the post body cell (``td`` with class ``l w pd``).
        page_num: Thread page number the rows were scraped from.
        url: URL of the page the rows were scraped from.

    Returns:
        A dict with the keys ``username``, ``gender``, ``timestamp``,
        ``post_text``, ``likes``, ``shares``, ``page_number`` and
        ``post_url``; or ``None`` when ``meta_row`` has no header cell.
    """
    meta_td = meta_row.find('td', class_='bold l pu')
    if not meta_td:
        return None

    username_tag = meta_td.find('a', class_='user')
    username = username_tag.text.strip() if username_tag else ""

    # The timestamp is spread over the <b> tags of the header cell; joining
    # an empty result yields "" so no separate empty check is needed.
    timestamp = ' '.join(b.text.strip() for b in meta_td.find_all('b'))

    # Gender is not explicitly labeled, so we leave it blank for now
    gender = ""

    content_td = content_row.find('td', class_='l w pd')
    content_div = content_td.find('div', class_='narrow') if content_td else None
    post_text = content_div.get_text(separator=' ', strip=True) if content_div else ""

    # Likes and Shares (if mentioned) live in a <p class="s"> inside the body cell.
    likes = 0
    shares = 0
    counts_tag = content_td.find('p', class_='s') if content_td else None
    if counts_tag:
        counts_text = counts_tag.text
        like_match = re.search(r'(\d+)\s+Like', counts_text)
        share_match = re.search(r'(\d+)\s+Share', counts_text)
        if like_match:
            likes = int(like_match.group(1))
        if share_match:
            shares = int(share_match.group(1))

    return {
        'username': username,
        'gender': gender,
        'timestamp': timestamp,
        'post_text': post_text,
        'likes': likes,
        'shares': shares,
        'page_number': page_num,
        'post_url': url
    }


def parse_page(page_num, timeout=30):
    """Fetch one page of the thread and return its posts as a list of dicts.

    Page 1 is fetched from ``BASE_URL`` itself; any other page from
    ``BASE_URL/<page_num>``. Rows of the ``table[summary="posts"]`` table are
    consumed in consecutive pairs: a metadata row followed by a content row.

    Args:
        page_num: Page number of the thread to fetch.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Without it a stalled connection would block
            the whole scrape indefinitely.

    Returns:
        A list of post dicts (see ``_extract_post`` for the keys). The list
        is empty when the request fails, the response status is not 200, or
        no posts are found. Failures are reported with ``print`` rather than
        raised, so one bad page does not abort a multi-page scrape.
    """
    url = BASE_URL if page_num == 1 else f"{BASE_URL}/{page_num}"

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors and timeouts follow the same "skip this page"
        # contract as a non-200 response.
        print(f"Failed to fetch page {page_num}: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page {page_num}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.select('table[summary="posts"] tr')

    posts = []
    # zip() pairs (row 0, row 1), (row 2, row 3), ... and drops an unpaired
    # trailing row instead of indexing past the end of the list.
    for meta_row, content_row in zip(rows[::2], rows[1::2]):
        try:
            post = _extract_post(meta_row, content_row, page_num, url)
        except Exception as e:
            # Best-effort scraping: report the malformed post and carry on.
            print(f"Error parsing post on page {page_num}: {e}")
            continue
        if post is not None:
            posts.append(post)

    return posts
