In [36]:
import random
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Listing URL template; ``{}`` is filled with the page number by fetch_page.
# ``f=2`` selects the forum whose rows parse_forum looks up under the
# ``threadbits_forum_2`` table body.
BASE_URL = "https://www.djelfa.info/vb/forumdisplay.php?f=2&page={}"
# Headers sent with every request.
# NOTE(review): presumably a browser-like User-Agent is needed so the site
# does not reject the default client string - confirm.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Destination file for the scraped topics, written by main().
CSV_PATH = "djelfa_forum_topics.csv"


def fetch_page(page):
    """Fetch one forum listing page and return its HTML.

    Args:
        page: Page number substituted into ``BASE_URL``.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or on a network-level failure
        (timeout, connection error, ...). A message is printed in both
        failure cases.
    """
    url = BASE_URL.format(page)
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole crawl: report it and
        # return None, which is the same "failed" signal used for bad status
        # codes below.
        print(f"[!] Failed to load page {page}: {exc}")
        return None

    if resp.status_code == 200:
        return resp.text
    print(f"[!] Failed to load page {page}, code: {resp.status_code}")
    return None


def parse_forum(html, page_number):
    """Extract forum topics from the HTML of one listing page.

    Rows are taken from ``tbody#threadbits_forum_2``; rows without a
    ``thread_title_*`` anchor are skipped.

    Args:
        html: HTML source of the listing page.
        page_number: Page number recorded with every extracted topic.

    Returns:
        A list of dicts, one per topic, keyed by the Arabic column names used
        in the CSV output (page, title, author, replies, views, last post,
        link). Reply and view counts are kept as the raw cell text.
    """
    site_root = "https://www.djelfa.info/vb/"
    soup = BeautifulSoup(html, "html.parser")
    threads = soup.select("tbody#threadbits_forum_2 tr")
    data = []

    for tr in threads:
        title_elem = tr.select_one("a[id^='thread_title_']")
        if not title_elem:
            continue

        title = title_elem.text.strip()
        # urljoin resolves relative hrefs against the forum root exactly like
        # plain concatenation did, but also copes with absolute or
        # root-relative hrefs instead of producing a doubled-up URL. A missing
        # href no longer raises KeyError.
        href = title_elem.get("href")
        link = urljoin(site_root, href) if href else "N/A"

        author_elem = tr.select_one("td.alt2 a[href^='member.php']")
        author = author_elem.text.strip() if author_elem else "Unknown"

        # First centred ``alt1`` cell is read as the reply count.
        reply_cells = tr.select("td.alt1[align='center']")
        replies = reply_cells[0].text.strip() if reply_cells else "0"

        # Last centred ``alt2`` cell is read as the view count.
        view_cells = tr.select("td.alt2[align='center']")
        views = view_cells[-1].text.strip() if view_cells else "0"

        last_post_info = tr.select_one("td.alt2[title]")
        if last_post_info:
            # Non-breaking spaces are normalised to plain spaces.
            last_post_text = last_post_info.get_text(strip=True).replace("\xa0", " ")
        else:
            last_post_text = "N/A"

        data.append({
            "الصفحة": page_number,
            "العنوان": title,
            "الكاتب": author,
            "الردود": replies,
            "المشاهدات": views,
            "آخر_مشاركة": last_post_text,
            "الرابط": link,
        })

    return data


def main(first_page=1, last_page=199):
    """Scrape a range of forum pages and save the topics to a CSV file.

    For each page the HTML is fetched with ``fetch_page`` and parsed with
    ``parse_forum``. Pages that fail to load are skipped. The collected
    topics are written to ``CSV_PATH`` and the first ten rows are printed.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive). The defaults cover
            pages 1 through 199.
    """
    print("Starting Web Scraping from Djelfa Forum...")
    start_time = datetime.now()
    all_topics = []

    for page in range(first_page, last_page + 1):
        print(f"\nFetching page {page}...")
        html = fetch_page(page)

        if html:
            # Step 1: Display HTML source code (for requirement #1)
            if page == first_page:
                print("\n--- HTML Preview (first 800 characters) ---\n")
                print(html[:800])
                print("\n--- End of HTML Preview ---\n")

            # Step 2: Extract data using BeautifulSoup (for requirement #2)
            topics = parse_forum(html, page)
            print(f"Extracted {len(topics)} topics from page {page}")
            all_topics.extend(topics)

        # Pause between requests even when the fetch failed, so a run of
        # failing pages does not turn into back-to-back requests against the
        # server. No pause is needed after the final page.
        if page != last_page:
            time.sleep(random.uniform(1.0, 2.0))

    # Step 3: Save results to CSV using pandas (for requirement #3)
    if all_topics:
        df = pd.DataFrame(all_topics)
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
        print(f"\n Saved {len(df)} records to '{CSV_PATH}'")
        print("\nPreview of first 10 rows:\n")
        print(df.head(10).to_string(index=False))
    else:
        print(" erreur No data found. Please check site structure or selectors.")

    print(f"\n Duration: {(datetime.now() - start_time).total_seconds():.2f} seconds")


# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Starting Web Scraping from Djelfa Forum...

Fetching page 1...

--- HTML Preview (first 800 characters) ---

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="https://www.w3.org/1999/xhtml" dir="rtl" lang="ar">
<head>
<metaname="viewport"content="initial-scale=1, maximum-scale=1">
<meta https-equiv="Content-Type" content="text/html; charset=windows-1256" />
<meta name="generator" content="vBulletin 3.8.10 Release Candidate 2" />

<meta name="keywords" content="الجلفة للمواضيع العامّة, منتدى الجلفة ،منتديات الجلفة، منتدى الجزائر ، منتديات الجزائر، منتدى جزائري، منتديات جزائرية،  منتدى العرب، أولاد نائل،  مدن الجزائر، المنتدى السياسي، منتدى نصرة الرسول محمد (ص)، الحضارة و المدن الإسلامية، المقاومة و المقاطعة، منتدى البرامج، الحاسوب و الأنترنت، الموبايل و الإتصالات، المصممين و تطوير المواقع، خيمة الأدب و الش

--- End of HTML Preview ---

Extracted 30 topics from page 1

Fetching page 2...
Extracted 30 to