### Basic imports and initializations


In [1]:
# Importing all basic libs

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
import time
import random


Scraping all pages: 100%|██████████| 1020/1020 [56:31<00:00,  3.33s/it]


Saved to nairaland_bbnaija_2024.csv


In [None]:
# URL of the first page of the BBNaija 2024 live-updates thread
BASE_URL = "https://www.nairaland.com/8156758/bbnaija-2024-live-updates-thread"

# Pool of browser-like User-Agent strings, sent so that requests resemble
# ordinary browser traffic and are less likely to trip bot detection
USER_AGENTS = [
    # Chrome 123 on Windows 10
    (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    ),
    # Safari 16.2 on macOS
    (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
        '(KHTML, like Gecko) Version/16.2 Safari/605.1.15'
    ),
    # Chrome 115 on Linux
    (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    ),
]


### Helper function that scrapes the data from a single page

In [None]:
def _parse_post(meta_td, content_row, page_num, url):
    """Build one post record from a header cell and the row that follows it.

    Args:
        meta_td: The header cell (``td`` with class ``bold l pu``) of a post.
        content_row: The table row directly after the header row, or None
            when the header row is the last row of the table.
        page_num: Page number the post was found on.
        url: URL of the page the post was found on.

    Returns:
        dict with the keys username, gender, timestamp, post_text, likes,
        shares, page_number and post_url.
    """
    username_tag = meta_td.find('a', class_='user')
    username = username_tag.text.strip() if username_tag else ""

    # The timestamp is assembled from the text of every <b> tag in the header cell.
    timestamp_parts = meta_td.find_all('b')
    timestamp = ' '.join(b.text.strip() for b in timestamp_parts)

    # Gender is not extracted from the page; the column is kept as an empty
    # placeholder so the output schema stays stable.
    gender = ""

    content_td = content_row.find('td', class_='l w pd') if content_row is not None else None
    content_div = content_td.find('div', class_='narrow') if content_td else None
    post_text = content_div.get_text(separator=' ', strip=True) if content_div else ""

    # Like/share counts are read from the <p class="s"> footer of the post body;
    # both default to 0 when the footer or the respective counter is absent.
    likes, shares = 0, 0
    like_text = content_td.find('p', class_='s') if content_td else None
    if like_text:
        like_match = re.search(r'(\d+)\s+Like', like_text.text)
        share_match = re.search(r'(\d+)\s+Share', like_text.text)
        if like_match:
            likes = int(like_match.group(1))
        if share_match:
            shares = int(share_match.group(1))

    return {
        'username': username,
        'gender': gender,
        'timestamp': timestamp,
        'post_text': post_text,
        'likes': likes,
        'shares': shares,
        'page_number': page_num,
        'post_url': url
    }


def parse_page(page_num):
    """Fetch one page of the thread and extract its posts.

    Page 1 is served at ``BASE_URL``; every other page at
    ``BASE_URL/<page_num>``. A User-Agent is picked at random from
    ``USER_AGENTS`` for each request.

    Args:
        page_num: Page number to fetch (1 selects the bare thread URL).

    Returns:
        A list of post dicts (see ``_parse_post`` for the keys). An empty
        list is returned when the request raises, the status code is not
        200, or no posts are found; a message is printed in the first two
        cases. No exception is propagated for request or parsing failures.
    """
    url = BASE_URL if page_num == 1 else f"{BASE_URL}/{page_num}"
    headers = {'User-Agent': random.choice(USER_AGENTS)}

    try:
        response = requests.get(url, headers=headers, timeout=15)
    except requests.exceptions.RequestException as e:
        print(f"Connection error on page {page_num}: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page {page_num}, Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.select('table[summary="posts"] tr')
    posts = []

    # Scan every row for a post header instead of assuming headers sit at even
    # indices: a single extra row in the table would otherwise shift the
    # header/content pairing and silently drop every later post on the page.
    for i, row in enumerate(rows):
        try:
            meta_td = row.find('td', class_='bold l pu')
            if not meta_td:
                continue

            # The post body is expected in the row right after the header; a
            # header on the last row is treated like a missing body cell.
            content_row = rows[i + 1] if i + 1 < len(rows) else None
            posts.append(_parse_post(meta_td, content_row, page_num, url))

        except Exception as e:
            # Best effort: one malformed post must not discard the rest of the page.
            print(f"Error parsing post on page {page_num}: {e}")
            continue

    return posts

### Running the above helper function on all 1020 pages and then saving as a .csv file

In [None]:
# Scrape settings
TOTAL_PAGES = 1020   # number of pages in the thread to scrape
MAX_ATTEMPTS = 3     # tries per page before giving up on it
OUTPUT_CSV = "nairaland_bbnaija_2024.csv"

all_posts = []
failed_pages = []    # pages that yielded no posts on every attempt

for page in tqdm(range(1, TOTAL_PAGES + 1), desc="Scraping all pages"):
    page_posts = []
    for attempt in range(1, MAX_ATTEMPTS + 1):
        page_posts = parse_page(page)
        if page_posts:
            break
        # Announce a retry and back off only when another attempt will follow.
        if attempt < MAX_ATTEMPTS:
            print(f"Retrying page {page}... attempt {attempt}")
            time.sleep(5 + random.uniform(1.5, 3.5))

    if page_posts:
        all_posts.extend(page_posts)
    else:
        # Keep track of the gap so an incomplete scrape is visible afterwards.
        failed_pages.append(page)
        print(f"Giving up on page {page} after {MAX_ATTEMPTS} attempts")

    # Randomized pause between pages to keep the request rate low
    time.sleep(random.uniform(2, 4))

if failed_pages:
    print(f"No posts collected from {len(failed_pages)} page(s): {failed_pages}")

# converting to dataframe so that can save to .csv later
df = pd.DataFrame(all_posts)

# saving as .csv file type
df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
print(f"Saved to {OUTPUT_CSV}")
