In [1]:


import os
import re
import time
import calendar
import json
import emoji
import emot
import pandas as pd

from datetime import datetime, timedelta
from tqdm import tqdm
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

# Optional: load environment variables (not strictly needed here)
from dotenv import load_dotenv
load_dotenv()

# Load kaomoji mapping if present
KAOMOJI_FILE = "kaomoji_to_text.json"
if os.path.exists(KAOMOJI_FILE):
    with open(KAOMOJI_FILE, "r", encoding="utf-8") as f:
        kaomoji_to_text = json.load(f)
else:
    kaomoji_to_text = {}

# Ensure output directory exists
OUTPUT_DIR = "data/beyondblue_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def parse_post_date(raw_date: str) -> str:
    """Convert relative or weekday dates into YYYY-MM-DD."""
    today = datetime.now()
    weekdays = [d.lower() for d in calendar.day_name]
    s = raw_date.strip().lower()

    if s in weekdays:
        delta = (today.weekday() - weekdays.index(s)) % 7
        return (today - timedelta(days=delta)).strftime("%Y-%m-%d")
    if "week" in s:
        n = int(re.search(r"(\d+)", s).group(1)) if re.search(r"\d+", s) else 1
        return (today - timedelta(weeks=n)).strftime("%Y-%m-%d")
    if "month" in s:
        n = int(re.search(r"(\d+)", s).group(1)) if re.search(r"\d+", s) else 1
        return (today - timedelta(days=30*n)).strftime("%Y-%m-%d")
    try:
        return datetime.strptime(raw_date, "%d-%m-%Y").strftime("%Y-%m-%d")
    except:
        return "Unknown"


def convert_emojis_emoticons(text: str) -> str:
    """Replace kaomoji, emoticons, and emojis with text labels."""
    for k, v in kaomoji_to_text.items():
        text = text.replace(k, f" {v} ")
    e = emot.core.emot()
    emo = e.emoticons(text)
    for orig, mean in zip(emo["value"], emo["mean"]):
        text = text.replace(orig, f" {mean} ")
    text = emoji.demojize(text)
    return text.strip().lower()


def comment_scraping(url: str, max_pages: int = 3, wait: int = 10) -> str:
    """Scrape up to `max_pages` of comments from a post page."""
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.set_page_load_timeout(30)
    comments = []
    try:
        page_url = url
        for _ in range(max_pages):
            try:
                driver.get(page_url)
                WebDriverWait(driver, wait).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "linear-message-list"))
                )
            except TimeoutException:
                break
            soup = BeautifulSoup(driver.page_source, "html.parser")
            section = soup.find("div", class_="linear-message-list")
            if not section:
                break
            for msg in section.find_all("div", recursive=False):
                txt = msg.get_text(" ", strip=True)
                comments.append(convert_emojis_emoticons(txt))
            nxt = soup.find("a", rel="next")
            if not nxt or not nxt.get("href"):
                break
            page_url = nxt["href"]
    finally:
        driver.quit()
    return " ||| ".join(comments)


def beyondblue_scraping(tag: str, start_url: str, pages: int = 20):
    """Scrape posts and comments for a given Beyond Blue forum tag."""
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.set_page_load_timeout(30)
    all_posts = []
    url = start_url
    try:
        for p in tqdm(range(1, pages + 1), desc=f"Scraping {tag}"):
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "custom-message-list"))
                )
            except TimeoutException:
                continue
            soup = BeautifulSoup(driver.page_source, "html.parser")
            container = soup.find("div", class_="custom-message-list all-discussions")
            if not container:
                break
            for art in container.find_all("article"):
                aside = art.find("aside")
                cat_div = aside.find("div", class_="custom-tile-category-content") if aside else None
                post_cat = cat_div.find("a").text.strip() if cat_div and cat_div.find("a") else ""
                raw_date = cat_div.find("time").text.strip() if cat_div and cat_div.find("time") else ""
                date = parse_post_date(raw_date)
                link_el = art.find("h3").find_all("a")[1]
                post_link = link_el["href"]
                post_id = post_link.rstrip("/").split("/")[-1]
                full_link = "https://forums.beyondblue.org.au" + post_link
                title = convert_emojis_emoticons(link_el.text.strip())
                body = art.find("p", class_="body-text")
                content = convert_emojis_emoticons(body.text.strip()) if body else ""
                auth_div = aside.find("div", class_="custom-tile-author-info") if aside else None
                auth_a = auth_div.find("a") if auth_div else None
                author = auth_a.get_text(strip=True) if auth_a else ""
                uid = auth_a["href"].split("user-id/")[-1] if auth_a and "user-id/" in auth_a["href"] else ""
                rep = art.find("li", class_="custom-tile-replies")
                num_com = rep.find("b").text.strip() if rep and rep.find("b") else "0"
                comms = comment_scraping(full_link, max_pages=3)
                all_posts.append({
                    "Post ID": post_id,
                    "Post Title": title,
                    "Post Content": content,
                    "Post Author": author,
                    "User ID": uid,
                    "Post Date": date,
                    "Post Category": post_cat,
                    "Number of Comments": num_com,
                    "Comments": comms
                })
            nxt_li = soup.find("li", class_="lia-paging-page-next")
            if nxt_li and nxt_li.find("a"):
                url = nxt_li.find("a")["href"]
            else:
                break
            if p % 10 == 0:
                temp_df = pd.DataFrame(all_posts).drop_duplicates(subset="Post ID")
                temp_df.to_csv(f"{OUTPUT_DIR}/{tag}_page{p}.csv", index=False)
    finally:
        driver.quit()
    df = pd.DataFrame(all_posts).drop_duplicates(subset="Post ID")
    df.to_csv(f"{OUTPUT_DIR}/{tag}_beyondblue_posts.csv", index=False)
    print(f"Saved {len(df)} posts for {tag} to {OUTPUT_DIR}/{tag}_beyondblue_posts.csv")


if __name__ == "__main__":
    mental_health_urls = {
        "Sex_identity": "https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=recent",
        "Multiculture":  "https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=recent",
        "Grief_loss":    "https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=recent"
    }
    for tag, addr in mental_health_urls.items():
        try:
            beyondblue_scraping(tag, addr, pages=50)
        except Exception as e:
            print(f"Error scraping {tag}: {e}")
            continue


Scraping Sex_identity: 100%|██████████| 50/50 [1:32:16<00:00, 110.74s/it]


Saved 500 posts for Sex_identity to data/beyondblue_data/Sex_identity_beyondblue_posts.csv


Scraping Multiculture:  50%|█████     | 25/50 [47:52<47:52, 114.90s/it]  


Saved 256 posts for Multiculture to data/beyondblue_data/Multiculture_beyondblue_posts.csv


Scraping Grief_loss:  38%|███▊      | 19/50 [35:28<57:52, 112.00s/it]  


Error scraping Grief_loss: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=138.0.7204.101)
Stacktrace:
	GetHandleVerifier [0x0xb844a3+62419]
	GetHandleVerifier [0x0xb844e4+62484]
	(No symbol) [0x0x9c2133]
	(No symbol) [0x0x9bf860]
	(No symbol) [0x0x9b30e2]
	(No symbol) [0x0x9b4b18]
	(No symbol) [0x0x9b3378]
	(No symbol) [0x0x9b2eb3]
	(No symbol) [0x0x9b2bc1]
	(No symbol) [0x0x9b0b64]
	(No symbol) [0x0x9b150b]
	(No symbol) [0x0x9c5b5e]
	(No symbol) [0x0xa51447]
	(No symbol) [0x0xa2f46c]
	(No symbol) [0x0xa5087a]
	(No symbol) [0x0xa2f266]
	(No symbol) [0x0x9fe852]
	(No symbol) [0x0x9ff6f4]
	GetHandleVerifier [0x0xdf4793+2619075]
	GetHandleVerifier [0x0xdefbaa+2599642]
	GetHandleVerifier [0x0xbab04a+221050]
	GetHandleVerifier [0x0xb9b2c8+156152]
	GetHandleVerifier [0x0xba1c7d+183213]
	GetHandleVerifier [0x0xb8c388+94904]
	GetHandleVerifier [0x0xb8c512+95298]
	GetHandleVerifier [0x0xb7766a+9626]
	BaseThreadInitThunk [0x0x74fd5d49+25]
	RtlInitializeExceptionCh

In [4]:
import os
import re
import time
import calendar
import json
import emoji
import emot
import pandas as pd
import hashlib
import unicodedata

from datetime import datetime, timedelta
from tqdm import tqdm
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Kaomoji mapping (if present)
KAOMOJI_FILE = "kaomoji_to_text.json"
if os.path.exists(KAOMOJI_FILE):
    with open(KAOMOJI_FILE, "r", encoding="utf-8") as f:
        kaomoji_to_text = json.load(f)
else:
    kaomoji_to_text = {}

OUTPUT_DIR = "Data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_text(s):
    """Remove unicode control characters, normalize whitespace, and strip."""
    if not s:
        return ""
    s = ''.join(c for c in s if unicodedata.category(c)[0] != 'C' and c != '\uFFFD')
    return s.replace('\xa0', ' ').replace('\u200e', '').strip()

def standardize_date(raw_date: str) -> str:
    """Attempt to parse and return date as 'YYYY-MM-DD HH:MM' or 'YYYY-MM-DD'."""
    s = clean_text(raw_date)
    today = datetime.now()
    weekdays = [d.lower() for d in calendar.day_name]
    s_lower = s.lower()

    # Handle relative dates
    if s_lower in weekdays:
        delta = (today.weekday() - weekdays.index(s_lower)) % 7
        return (today - timedelta(days=delta)).strftime("%Y-%m-%d")
    if "yesterday" in s_lower:
        return (today - timedelta(days=1)).strftime("%Y-%m-%d")
    if "today" in s_lower:
        return today.strftime("%Y-%m-%d")
    if "week" in s_lower:
        n = int(re.search(r"(\d+)", s_lower).group(1)) if re.search(r"\d+", s_lower) else 1
        return (today - timedelta(weeks=n)).strftime("%Y-%m-%d")
    if "month" in s_lower:
        n = int(re.search(r"(\d+)", s_lower).group(1)) if re.search(r"\d+", s_lower) else 1
        return (today - timedelta(days=30*n)).strftime("%Y-%m-%d")

    # Try various common formats
    fmts = [
        "%d-%m-%Y %I:%M %p", "%d-%m-%Y %H:%M", "%d-%m-%Y", "%Y-%m-%d",
        "%d/%m/%Y", "%d/%m/%Y %H:%M", "%Y-%m-%d %H:%M"
    ]
    for fmt in fmts:
        try:
            dt = datetime.strptime(s, fmt)
            return dt.strftime("%Y-%m-%d %H:%M")
        except Exception:
            continue
    # Try to extract date and time by regex
    m = re.match(r"(\d{2})-(\d{2})-(\d{4})\s*([0-9]{1,2}):([0-9]{2})", s)
    if m:
        return f"{m.group(3)}-{m.group(2)}-{m.group(1)} {m.group(4)}:{m.group(5)}"
    # Only date part
    m = re.match(r"(\d{2})-(\d{2})-(\d{4})", s)
    if m:
        return f"{m.group(3)}-{m.group(2)}-{m.group(1)}"
    # fallback: return cleaned string
    return s

def convert_emojis_emoticons(text: str) -> str:
    text = clean_text(text)
    for k, v in kaomoji_to_text.items():
        text = text.replace(k, f" {v} ")
    e = emot.core.emot()
    emo = e.emoticons(text)
    for orig, mean in zip(emo["value"], emo["mean"]):
        text = text.replace(orig, f" {mean} ")
    text = emoji.demojize(text)
    return text.strip().lower()

def get_existing_ids(filepath, id_column):
    if os.path.exists(filepath):
        try:
            df = pd.read_csv(filepath)
            return set(df[id_column].astype(str))
        except Exception:
            return set()
    else:
        return set()

def make_comment_id(msg, post_id, comment_content):
    """Use HTML id if available; otherwise, use a hash of content for stability."""
    comment_id = (msg.get('data-message-id') or msg.get('id') or '').strip()
    if comment_id and comment_id.lower() not in ['lineardisplaymessageviewwrapper', '']:
        return comment_id
    # Fallback: hash-based synthetic ID
    hash_part = hashlib.sha256(comment_content.encode('utf-8')).hexdigest()[:10]
    return f"{post_id}_c{hash_part}"

def extract_comment_date(msg):
    date_elem = msg.find("span", class_="local-friendly-date")
    if date_elem:
        if date_elem.has_attr('title') and date_elem['title'].strip():
            return standardize_date(date_elem['title'])
        elif date_elem.text.strip():
            return standardize_date(date_elem.text)
    datetime_elem = msg.find("span", class_="DateTime")
    if datetime_elem and datetime_elem.text.strip():
        return standardize_date(datetime_elem.text)
    if msg.has_attr('data-message-timestamp'):
        try:
            ts = int(msg['data-message-timestamp'])
            return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M')
        except Exception:
            pass
    return ''

def comment_scraping(driver, post_url, post_id, category, max_comments=50, retry=3, polite_delay=1):
    comments = []
    comments_csv = os.path.join(OUTPUT_DIR, f"comments_{category}.csv")
    existing_comment_ids = get_existing_ids(comments_csv, "Comment ID")

    url = post_url
    scraped = 0
    total_comments = 0
    for page in range(1, 100):  # Arbitrary large page count, will break on next not found
        if scraped >= max_comments:
            break
        success = False
        for attempt in range(retry):
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "linear-message-list"))
                )
                soup = BeautifulSoup(driver.page_source, "html.parser")
                success = True
                break
            except Exception:
                time.sleep(2)
        if not success:
            break

        section = soup.find("div", class_="linear-message-list")
        if not section:
            break

        msgs = section.find_all("div", recursive=False)
        if page == 1:
            header = soup.find("h2", class_="lia-message-thread-reply-count")
            if header:
                try:
                    total_comments = int(re.sub(r'[^\d]', '', header.text))
                except Exception:
                    total_comments = 0
            else:
                total_comments = len(msgs)

        for msg in msgs:
            if scraped >= max_comments:
                break
            content_elem = msg.find("div", class_="lia-message-body-content")
            comment_content = convert_emojis_emoticons(content_elem.get_text("\n", strip=True)) if content_elem else ""
            comment_id = make_comment_id(msg, post_id, comment_content)
            if comment_id in existing_comment_ids:
                continue

            author_elem = msg.find("a", class_="lia-user-name-link")
            comment_author = clean_text(author_elem.get_text(strip=True)) if author_elem else ""
            comment_date = extract_comment_date(msg)
            support_elem = msg.find("span", class_="lia-component-kudos-widget-message-kudos-count") \
                or msg.find("span", class_="kudos-count-link")
            comment_support = support_elem.text.strip() if support_elem else "0"
            comment_row = {
                "Comment ID": comment_id,
                "Post ID": post_id,
                "Category": category,
                "Comment Author": comment_author,
                "Comment Date": comment_date,
                "Comment Content": comment_content,
                "Comment Support": comment_support,
                "Post URL": post_url
            }
            comments.append(comment_row)
            existing_comment_ids.add(comment_id)  # Deduplicate within run
            scraped += 1
        # Pagination: find next page for comments
        nxt = soup.find("a", rel="next")
        if not nxt or not nxt.get("href"):
            break
        url = "https://forums.beyondblue.org.au" + nxt["href"] if nxt["href"].startswith("/") else nxt["href"]
        time.sleep(polite_delay)
    # Sort comments by date before returning (if possible)
    comments = sorted(comments, key=lambda c: c["Comment Date"])
    return comments, total_comments

def beyondblue_scraping(tag: str, start_url: str, pages: int = 20, polite_delay=2):
    posts_csv = os.path.join(OUTPUT_DIR, f"posts_{tag}.csv")
    comments_csv = os.path.join(OUTPUT_DIR, f"comments_{tag}.csv")
    existing_post_ids = get_existing_ids(posts_csv, "Post ID")
    existing_comment_ids = get_existing_ids(comments_csv, "Comment ID")
    all_posts = []
    all_comments = []

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.set_page_load_timeout(30)
    url = start_url
    try:
        for p in tqdm(range(1, pages + 1), desc=f"Scraping {tag}"):
            for attempt in range(3):
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "custom-message-list"))
                    )
                    soup = BeautifulSoup(driver.page_source, "html.parser")
                    break
                except Exception:
                    if attempt == 2:
                        print(f"Failed to load page {url}")
                        return
                    time.sleep(2)
            container = soup.find("div", class_="custom-message-list all-discussions")
            if not container:
                break
            for art in container.find_all("article"):
                aside = art.find("aside")
                cat_div = aside.find("div", class_="custom-tile-category-content") if aside else None
                post_cat = clean_text(cat_div.find("a").text.strip()) if cat_div and cat_div.find("a") else tag
                raw_date = cat_div.find("time").text.strip() if cat_div and cat_div.find("time") else ""
                date = standardize_date(raw_date)
                h3 = art.find("h3")
                link_els = h3.find_all("a") if h3 else []
                link_el = link_els[1] if len(link_els) > 1 else (link_els[0] if link_els else None)
                post_link = link_el["href"] if link_el and link_el.has_attr("href") else ""
                post_id = post_link.rstrip("/").split("/")[-1] if post_link else ""
                if not post_id or post_id in existing_post_ids:
                    continue
                full_link = "https://forums.beyondblue.org.au" + post_link if post_link.startswith("/") else post_link
                title = convert_emojis_emoticons(link_el.text.strip()) if link_el else ""
                body = art.find("p", class_="body-text")
                content = convert_emojis_emoticons(body.text.strip()) if body else ""
                auth_div = aside.find("div", class_="custom-tile-author-info") if aside else None
                auth_a = auth_div.find("a") if auth_div else None
                author = clean_text(auth_a.get_text(strip=True)) if auth_a else ""
                rep = art.find("li", class_="custom-tile-replies")
                num_com = rep.find("b").text.strip() if rep and rep.find("b") else "0"
                post_support = "0"

                post_content = ""
                for attempt in range(3):
                    try:
                        driver.execute_script("window.open('');")
                        driver.switch_to.window(driver.window_handles[1])
                        driver.get(full_link)
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "lia-message-body-content"))
                        )
                        post_soup = BeautifulSoup(driver.page_source, "html.parser")
                        content_elems = post_soup.find_all("div", class_="lia-message-body-content")
                        post_content = "\n".join([convert_emojis_emoticons(elem.get_text("\n", strip=True)) for elem in content_elems if elem.get_text(strip=True)])
                        # Extract support count from the full post page
                        support_span = post_soup.find("span", class_="lia-component-kudos-widget-message-kudos-count") \
                            or post_soup.find("span", class_="kudos-count-link")
                        if support_span:
                            post_support = support_span.text.strip()
                        else:
                            post_support = "0"
                        comments, total_comment_count = comment_scraping(driver, full_link, post_id, tag, max_comments=50)
                        # Deduplicate comments (skip already scraped)
                        new_comments = []
                        for c in comments:
                            if c["Comment ID"] not in existing_comment_ids:
                                new_comments.append(c)
                                existing_comment_ids.add(c["Comment ID"])
                        all_comments.extend(new_comments)
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                        break
                    except Exception as e:
                        if attempt == 2:
                            print(f"Failed to extract post page {full_link}: {e}")
                            try:
                                driver.close()
                                driver.switch_to.window(driver.window_handles[0])
                            except: pass
                            post_content = content
                            total_comment_count = num_com
                            post_support = "0"
                        else:
                            time.sleep(2)
                post_data = {
                    "Post ID": post_id,
                    "Category": post_cat,
                    "Post Title": title,
                    "Post Author": author,
                    "Post Date": date,
                    "Post Content": post_content,
                    "Support Count": post_support,
                    "Total Number of Comments": total_comment_count,
                    "Post URL": full_link
                }
                all_posts.append(post_data)
                existing_post_ids.add(post_id)
                time.sleep(polite_delay)
            nxt_li = soup.find("li", class_="lia-paging-page-next")
            if nxt_li and nxt_li.find("a"):
                next_href = nxt_li.find("a")["href"]
                url = "https://forums.beyondblue.org.au" + next_href if next_href.startswith("/") else next_href
            else:
                break
            if p % 5 == 0:
                dfp = pd.DataFrame(all_posts)
                dfp.sort_values(by="Post Date", inplace=True)
                dfp.to_csv(posts_csv, index=False)
                dfc = pd.DataFrame(all_comments)
                dfc.sort_values(by="Comment Date", inplace=True)
                dfc.to_csv(comments_csv, index=False)
    finally:
        driver.quit()
    # Final save, sorted by date
    if all_posts:
        dfp = pd.DataFrame(all_posts)
        if os.path.exists(posts_csv):
            dfp_existing = pd.read_csv(posts_csv)
            dfp = pd.concat([dfp_existing, dfp], ignore_index=True)
            dfp.drop_duplicates(subset=["Post ID"], inplace=True)
        dfp.sort_values(by="Post Date", inplace=True)
        dfp.to_csv(posts_csv, index=False)
        print(f"Saved {len(dfp)} posts to {posts_csv}")
    if all_comments:
        dfc = pd.DataFrame(all_comments)
        if os.path.exists(comments_csv):
            dfc_existing = pd.read_csv(comments_csv)
            dfc = pd.concat([dfc_existing, dfc], ignore_index=True)
            dfc.drop_duplicates(subset=["Comment ID"], inplace=True)
        dfc.sort_values(by="Comment Date", inplace=True)
        dfc.to_csv(comments_csv, index=False)
        print(f"Saved {len(dfc)} comments to {comments_csv}")

if __name__ == "__main__":
    mental_health_urls = {
        "anxiety":      "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=recent",
        "depression":   "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=recent",
        "ptsd_trauma":  "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3?&sort=recent",
        "suicidal_selfharm": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4?&sort=recent",
        "staying_well": "https://forums.beyondblue.org.au/t5/staying-well/bd-p/c1-sc3-b1?&sort=recent",
        "treatments":   "https://forums.beyondblue.org.au/t5/treatments-health-professionals/bd-p/c1-sc3-b2?&sort=recent",
        "relationships":"https://forums.beyondblue.org.au/t5/relationship-and-family-issues/bd-p/c1-sc3-b3?&sort=recent",
        "supporting_friends": "https://forums.beyondblue.org.au/t5/supporting-family-and-friends/bd-p/c1-sc3-b4?&sort=recent",
        "long_term_support":  "https://forums.beyondblue.org.au/t5/long-term-support-over-the/bd-p/c1-sc3-b5?&sort=recent",
        "young_people": "https://forums.beyondblue.org.au/t5/young-people/bd-p/c1-sc4-b1?&sort=recent",
        "Sex_identity": "https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=recent",
        "Multiculture":  "https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=recent",
        "Grief_loss":    "https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=recent"
    }
    for tag, addr in mental_health_urls.items():
        try:
            beyondblue_scraping(tag, addr, pages=100)
        except Exception as e:
            print(f"Error scraping {tag}: {e}")
            continue

Scraping anxiety: 100%|██████████| 100/100 [44:24<00:00, 26.64s/it] 


Saved 904 posts to Data\posts_anxiety.csv
Saved 452 comments to Data\comments_anxiety.csv


Scraping depression:  23%|██▎       | 23/100 [22:32<1:15:26, 58.78s/it]


KeyboardInterrupt: 

In [9]:
import os
import re
import time
import calendar
import json
import emoji
import emot
import pandas as pd
import hashlib
import unicodedata

from datetime import datetime, timedelta
from tqdm import tqdm
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

KAOMOJI_FILE = "kaomoji_to_text.json"
if os.path.exists(KAOMOJI_FILE):
    with open(KAOMOJI_FILE, "r", encoding="utf-8") as f:
        kaomoji_to_text = json.load(f)
else:
    kaomoji_to_text = {}

OUTPUT_DIR = "Data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_text(s):
    if not s:
        return ""
    s = ''.join(c for c in s if unicodedata.category(c)[0] != 'C' and c != '\uFFFD')
    return s.replace('\xa0', ' ').replace('\u200e', '').strip()

def standardize_date(raw_date: str) -> str:
    s = clean_text(raw_date)
    today = datetime.now()
    weekdays = [d.lower() for d in calendar.day_name]
    s_lower = s.lower()

    # Handle relative dates
    if s_lower in weekdays:
        delta = (today.weekday() - weekdays.index(s_lower)) % 7
        return (today - timedelta(days=delta)).strftime("%d-%m-%Y")
    if "yesterday" in s_lower:
        return (today - timedelta(days=1)).strftime("%d-%m-%Y")
    if "today" in s_lower:
        return today.strftime("%d-%m-%Y")
    if "week" in s_lower:
        n = int(re.search(r"(\d+)", s_lower).group(1)) if re.search(r"\d+", s_lower) else 1
        return (today - timedelta(weeks=n)).strftime("%d-%m-%Y")
    if "month" in s_lower:
        n = int(re.search(r"(\d+)", s_lower).group(1)) if re.search(r"\d+", s_lower) else 1
        return (today - timedelta(days=30*n)).strftime("%d-%m-%Y")

    # Try most relevant date formats
    fmts = [
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%d-%m-%Y %H:%M",
        "%d-%m-%Y %I:%M %p",
        "%d/%m/%Y %H:%M",
        "%d/%m/%Y %I:%M %p",
        "%d-%m-%Y%H:%M",
        "%Y-%m-%d%H:%M",
        "%d/%m/%Y%H:%M",
    ]
    for fmt in fmts:
        try:
            dt = datetime.strptime(s, fmt)
            return dt.strftime("%d-%m-%Y")
        except Exception:
            continue
    # Date only formats (no time info)
    date_fmts = [
        "%d-%m-%Y", "%Y-%m-%d", "%d/%m/%Y"
    ]
    for fmt in date_fmts:
        try:
            dt = datetime.strptime(s, fmt)
            return dt.strftime("%d-%m-%Y")
        except Exception:
            continue
    # Fallback: regex for non-standard
    m = re.match(r"(\d{2})-(\d{2})-(\d{4})", s)
    if m:
        return f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
    return s

def convert_emojis_emoticons(text: str) -> str:
    text = clean_text(text)
    for k, v in kaomoji_to_text.items():
        text = text.replace(k, f" {v} ")
    e = emot.core.emot()
    emo = e.emoticons(text)
    for orig, mean in zip(emo["value"], emo["mean"]):
        text = text.replace(orig, f" {mean} ")
    text = emoji.demojize(text)
    return text.strip().lower()

def get_existing_ids(filepath, id_column):
    if os.path.exists(filepath):
        try:
            df = pd.read_csv(filepath)
            return set(df[id_column].astype(str))
        except Exception:
            return set()
    else:
        return set()

def make_comment_id(msg, post_id, comment_content):
    comment_id = (msg.get('data-message-id') or msg.get('id') or '').strip()
    if comment_id and comment_id.lower() not in ['lineardisplaymessageviewwrapper', '']:
        return comment_id
    hash_part = hashlib.sha256(comment_content.encode('utf-8')).hexdigest()[:10]
    return f"{post_id}_c{hash_part}"

def extract_comment_date(msg):
    time_elem = msg.find("time")
    if time_elem:
        if time_elem.has_attr('datetime'):
            return standardize_date(time_elem['datetime'])
        elif time_elem.has_attr('title'):
            return standardize_date(time_elem['title'])
        elif time_elem.text.strip():
            return standardize_date(time_elem.text)
    date_elem = msg.find("span", class_="local-friendly-date")
    if date_elem:
        if date_elem.has_attr('title') and date_elem['title'].strip():
            return standardize_date(date_elem['title'])
        elif date_elem.text.strip():
            return standardize_date(date_elem.text)
    datetime_elem = msg.find("span", class_="DateTime")
    if datetime_elem and datetime_elem.text.strip():
        return standardize_date(datetime_elem.text)
    if msg.has_attr('data-message-timestamp'):
        try:
            ts = int(msg['data-message-timestamp'])
            return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M')
        except Exception:
            pass
    return ''

def comment_scraping(driver, post_url, post_id, category, max_comments=200, retry=3, polite_delay=1):
    comments = []
    comments_csv = os.path.join(OUTPUT_DIR, f"comments_{category}.csv")
    existing_comment_ids = get_existing_ids(comments_csv, "Comment ID")
    url = post_url
    scraped = 0
    for page in range(1, 100):  # will break on next not found
        if scraped >= max_comments:
            break
        success = False
        for attempt in range(retry):
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "linear-message-list"))
                )
                soup = BeautifulSoup(driver.page_source, "html.parser")
                success = True
                break
            except Exception:
                time.sleep(2)
        if not success:
            break

        section = soup.find("div", class_="linear-message-list")
        if not section:
            break

        msgs = section.find_all("div", class_="lia-message-view-display")
        for msg in msgs:
            if scraped >= max_comments:
                break
            content_elem = msg.find("div", class_="lia-message-body-content")
            comment_content = convert_emojis_emoticons(content_elem.get_text("\n", strip=True)) if content_elem else ""
            comment_id = make_comment_id(msg, post_id, comment_content)
            if comment_id in existing_comment_ids:
                continue
            author_elem = msg.find("a", class_="lia-user-name-link")
            comment_author = clean_text(author_elem.get_text(strip=True)) if author_elem else ""
            comment_date = extract_comment_date(msg)
            support_elem = msg.find("span", {"id": re.compile(r"^kudos-count-")})
            if not support_elem:
                support_elem = msg.find("span", class_="lia-component-kudos-widget-message-kudos-count")
            comment_support = support_elem.text.strip() if support_elem else "0"
            comment_row = {
                "Comment ID": comment_id,
                "Post ID": post_id,
                "Category": category,
                "Comment Author": comment_author,
                "Comment Date": comment_date,
                "Comment Content": comment_content,
                "Comment Support": comment_support,
                "Post URL": post_url
            }
            comments.append(comment_row)
            existing_comment_ids.add(comment_id)
            scraped += 1
        nxt = soup.find("a", rel="next")
        if not nxt or not nxt.get("href"):
            break
        url = "https://forums.beyondblue.org.au" + nxt["href"] if nxt["href"].startswith("/") else nxt["href"]
        time.sleep(polite_delay)
    comments = sorted(comments, key=lambda c: c["Comment Date"])
    return comments

def beyondblue_scraping(tag: str, start_url: str, pages: int = 20, polite_delay=2):
    posts_csv = os.path.join(OUTPUT_DIR, f"posts_{tag}.csv")
    comments_csv = os.path.join(OUTPUT_DIR, f"comments_{tag}.csv")
    existing_post_ids = get_existing_ids(posts_csv, "Post ID")
    existing_comment_ids = get_existing_ids(comments_csv, "Comment ID")
    all_posts = []
    all_comments = []

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.set_page_load_timeout(30)
    url = start_url
    try:
        for p in tqdm(range(1, pages + 1), desc=f"Scraping {tag}"):
            for attempt in range(3):
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "custom-message-list"))
                    )
                    soup = BeautifulSoup(driver.page_source, "html.parser")
                    break
                except Exception:
                    if attempt == 2:
                        print(f"Failed to load page {url}")
                        return
                    time.sleep(2)
            container = soup.find("div", class_="custom-message-list all-discussions")
            if not container:
                break
            for art in container.find_all("article"):
                aside = art.find("aside")
                cat_div = aside.find("div", class_="custom-tile-category-content") if aside else None
                post_cat = clean_text(cat_div.find("a").text.strip()) if cat_div and cat_div.find("a") else tag
                # --- DATE ---
                time_el = cat_div.find("time") if cat_div else None
                if time_el and time_el.has_attr('datetime'):
                    raw_date = time_el['datetime']
                elif time_el and time_el.has_attr('title'):
                    raw_date = time_el['title']
                elif time_el:
                    raw_date = time_el.text.strip()
                else:
                    raw_date = ""
                date = standardize_date(raw_date)
                # --- SUPPORT AND REPLIES ---
                support_li = art.find("li", class_="custom-tile-kudos")
                support_span = support_li.find("span") if support_li else None
                post_support = support_span.text.strip() if support_span else "0"

                replies_li = art.find("li", class_="custom-tile-replies")
                replies_b = replies_li.find("b") if replies_li else None
                total_comment_count = replies_b.text.strip() if replies_b else "0"
                # --- POST TITLE/URL/CONTENT ---
                h3 = art.find("h3")
                link_els = h3.find_all("a") if h3 else []
                link_el = link_els[1] if len(link_els) > 1 else (link_els[0] if link_els else None)
                post_link = link_el["href"] if link_el and link_el.has_attr("href") else ""
                post_id = post_link.rstrip("/").split("/")[-1] if post_link else ""
                if not post_id or post_id in existing_post_ids:
                    continue
                full_link = "https://forums.beyondblue.org.au" + post_link if post_link.startswith("/") else post_link
                title = convert_emojis_emoticons(link_el.text.strip()) if link_el else ""
                body = art.find("p", class_="body-text")
                content = convert_emojis_emoticons(body.text.strip()) if body else ""
                auth_div = aside.find("div", class_="custom-tile-author-info") if aside else None
                auth_a = auth_div.find("a") if auth_div else None
                author = clean_text(auth_a.get_text(strip=True)) if auth_a else ""
                post_content = content
                # --- Get real post support from detail page ---
                for attempt in range(3):
                    try:
                        driver.execute_script("window.open('');")
                        driver.switch_to.window(driver.window_handles[1])
                        driver.get(full_link)
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "lia-message-body-content"))
                        )
                        post_soup = BeautifulSoup(driver.page_source, "html.parser")
                        content_elems = post_soup.find_all("div", class_="lia-message-body-content")
                        if content_elems:
                            post_content = "\n".join([convert_emojis_emoticons(elem.get_text("\n", strip=True)) for elem in content_elems if elem.get_text(strip=True)])
                        # Get support count from first post in thread
                        support_span_detail = post_soup.find("span", {"id": re.compile(r"^kudos-count-")})
                        if not support_span_detail:
                            support_span_detail = post_soup.find("span", class_="lia-component-kudos-widget-message-kudos-count")
                        if support_span_detail:
                            post_support = support_span_detail.text.strip()
                        # Get post date from detail if missing
                        time_elem = post_soup.find("time")
                        if not date and time_elem:
                            if time_elem.has_attr("datetime"):
                                date = standardize_date(time_elem["datetime"])
                            elif time_elem.has_attr("title"):
                                date = standardize_date(time_elem["title"])
                            else:
                                date = standardize_date(time_elem.text.strip())
                        # Scrape all comments (not for count, but real data)
                        comments = comment_scraping(driver, full_link, post_id, tag, max_comments=200)
                        new_comments = []
                        for c in comments:
                            if c["Comment ID"] not in existing_comment_ids:
                                new_comments.append(c)
                                existing_comment_ids.add(c["Comment ID"])
                        all_comments.extend(new_comments)
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                        break
                    except Exception as e:
                        if attempt == 2:
                            print(f"Failed to extract post page {full_link}: {e}")
                            try:
                                driver.close()
                                driver.switch_to.window(driver.window_handles[0])
                            except: pass
                        else:
                            time.sleep(2)
                post_data = {
                    "Post ID": post_id,
                    "Category": post_cat,
                    "Post Title": title,
                    "Post Author": author,
                    "Post Date": date,
                    "Post Content": post_content,
                    "Support Count": post_support,
                    "Total Number of Comments": total_comment_count,
                    "Post URL": full_link
                }
                all_posts.append(post_data)
                existing_post_ids.add(post_id)
                time.sleep(polite_delay)
            nxt_li = soup.find("li", class_="lia-paging-page-next")
            if nxt_li and nxt_li.find("a"):
                next_href = nxt_li.find("a")["href"]
                url = "https://forums.beyondblue.org.au" + next_href if next_href.startswith("/") else next_href
            else:
                break
            if p % 5 == 0:
                dfp = pd.DataFrame(all_posts)
                dfp.sort_values(by="Post Date", inplace=True)
                dfp.to_csv(posts_csv, index=False)
                dfc = pd.DataFrame(all_comments)
                dfc.sort_values(by="Comment Date", inplace=True)
                dfc.to_csv(comments_csv, index=False)
    finally:
        driver.quit()
    if all_posts:
        dfp = pd.DataFrame(all_posts)
        if os.path.exists(posts_csv):
            dfp_existing = pd.read_csv(posts_csv)
            dfp = pd.concat([dfp_existing, dfp], ignore_index=True)
            dfp.drop_duplicates(subset=["Post ID"], inplace=True)
        dfp.sort_values(by="Post Date", inplace=True)
        dfp.to_csv(posts_csv, index=False)
        print(f"Saved {len(dfp)} posts to {posts_csv}")
    if all_comments:
        dfc = pd.DataFrame(all_comments)
        if os.path.exists(comments_csv):
            dfc_existing = pd.read_csv(comments_csv)
            dfc = pd.concat([dfc_existing, dfc], ignore_index=True)
            dfc.drop_duplicates(subset=["Comment ID"], inplace=True)
        dfc.sort_values(by="Comment Date", inplace=True)
        dfc.to_csv(comments_csv, index=False)
        print(f"Saved {len(dfc)} comments to {comments_csv}")

if __name__ == "__main__":
    mental_health_urls = {
        "anxiety":      "https://forums.beyondblue.org.au/t5/anxiety/bd-p/c1-sc2-b1?&sort=recent",
        "depression":   "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=recent",
        "ptsd_trauma":  "https://forums.beyondblue.org.au/t5/ptsd-and-trauma/bd-p/c1-sc2-b3?&sort=recent",
        "suicidal_selfharm": "https://forums.beyondblue.org.au/t5/suicidal-thoughts-and-self-harm/bd-p/c1-sc2-b4?&sort=recent",
        "staying_well": "https://forums.beyondblue.org.au/t5/staying-well/bd-p/c1-sc3-b1?&sort=recent",
        "treatments":   "https://forums.beyondblue.org.au/t5/treatments-health-professionals/bd-p/c1-sc3-b2?&sort=recent",
        "relationships":"https://forums.beyondblue.org.au/t5/relationship-and-family-issues/bd-p/c1-sc3-b3?&sort=recent",
        "supporting_friends": "https://forums.beyondblue.org.au/t5/supporting-family-and-friends/bd-p/c1-sc3-b4?&sort=recent",
        "long_term_support":  "https://forums.beyondblue.org.au/t5/long-term-support-over-the/bd-p/c1-sc3-b5?&sort=recent",
        "young_people": "https://forums.beyondblue.org.au/t5/young-people/bd-p/c1-sc4-b1?&sort=recent",
        "Sex_identity": "https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=recent",
        "Multiculture":  "https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=recent",
        "Grief_loss":    "https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=recent"
    }
    for tag, addr in mental_health_urls.items():
        try:
            beyondblue_scraping(tag, addr, pages=100)
        except Exception as e:
            print(f"Error scraping {tag}: {e}")
            continue

Scraping anxiety: 100%|██████████| 100/100 [1:37:48<00:00, 58.69s/it]


Saved 1996 posts to Data\posts_anxiety.csv
Saved 998 comments to Data\comments_anxiety.csv
Error scraping depression: Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
	GetHandleVerifier [0x0x1271a33+62339]
	GetHandleVerifier [0x0x1271a74+62404]
	(No symbol) [0x0x10b2123]
	(No symbol) [0x0x10e58b8]
	(No symbol) [0x0x10e11a9]
	(No symbol) [0x0x112ae77]
	(No symbol) [0x0x112a76a]
	(No symbol) [0x0x111f1b6]
	(No symbol) [0x0x10ee7a2]
	(No symbol) [0x0x10ef644]
	GetHandleVerifier [0x0x14e65c3+2637587]
	GetHandleVerifier [0x0x14e19ca+2618138]
	GetHandleVerifier [0x0x12984aa+220666]
	GetHandleVerifier [0x0x12888d8+156200]
	GetHandleVerifier [0x0x128f06d+182717]
	GetHandleVerifier [0x0x1279978+94920]
	GetHandleVerifier [0

Scraping ptsd_trauma: 100%|██████████| 100/100 [1:54:51<00:00, 68.92s/it] 


Saved 2000 posts to Data\posts_ptsd_trauma.csv
Saved 1000 comments to Data\comments_ptsd_trauma.csv


Scraping suicidal_selfharm: 100%|██████████| 100/100 [1:59:52<00:00, 71.93s/it]  


Saved 2000 posts to Data\posts_suicidal_selfharm.csv
Saved 1000 comments to Data\comments_suicidal_selfharm.csv


Scraping staying_well: 100%|██████████| 100/100 [3:08:35<00:00, 113.16s/it] 


Saved 2000 posts to Data\posts_staying_well.csv
Saved 1000 comments to Data\comments_staying_well.csv


Scraping treatments: 100%|██████████| 100/100 [1:35:54<00:00, 57.55s/it]


Saved 2000 posts to Data\posts_treatments.csv
Saved 1000 comments to Data\comments_treatments.csv


Scraping relationships: 100%|██████████| 100/100 [1:34:05<00:00, 56.46s/it]


Saved 2000 posts to Data\posts_relationships.csv
Saved 1000 comments to Data\comments_relationships.csv


Scraping supporting_friends: 100%|██████████| 100/100 [1:37:48<00:00, 58.69s/it]


Saved 2000 posts to Data\posts_supporting_friends.csv
Saved 1000 comments to Data\comments_supporting_friends.csv


Scraping long_term_support:   9%|▉         | 9/100 [1:58:51<20:01:52, 792.44s/it]


KeyboardInterrupt: 

In [10]:
import os
import re
import time
import calendar
import json
import emoji
import emot
import pandas as pd
import hashlib
import unicodedata

from datetime import datetime, timedelta
from tqdm import tqdm
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

KAOMOJI_FILE = "kaomoji_to_text.json"
if os.path.exists(KAOMOJI_FILE):
    with open(KAOMOJI_FILE, "r", encoding="utf-8") as f:
        kaomoji_to_text = json.load(f)
else:
    kaomoji_to_text = {}

OUTPUT_DIR = "Data"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def clean_text(s):
    if not s:
        return ""
    s = ''.join(c for c in s if unicodedata.category(c)[0] != 'C' and c != '\uFFFD')
    return s.replace('\xa0', ' ').replace('\u200e', '').strip()


def standardize_date(raw_date: str) -> str:
    s = clean_text(raw_date)
    today = datetime.now()
    weekdays = [d.lower() for d in calendar.day_name]
    s_lower = s.lower()

    # Handle relative dates
    if s_lower in weekdays:
        delta = (today.weekday() - weekdays.index(s_lower)) % 7
        return (today - timedelta(days=delta)).strftime("%d-%m-%Y")
    if "yesterday" in s_lower:
        return (today - timedelta(days=1)).strftime("%d-%m-%Y")
    if "today" in s_lower:
        return today.strftime("%d-%m-%Y")
    if "week" in s_lower:
        n = int(re.search(r"(\d+)", s_lower).group(1)) if re.search(r"\d+", s_lower) else 1
        return (today - timedelta(weeks=n)).strftime("%d-%m-%Y")
    if "month" in s_lower:
        n = int(re.search(r"(\d+)", s_lower).group(1)) if re.search(r"\d+", s_lower) else 1
        return (today - timedelta(days=30 * n)).strftime("%d-%m-%Y")

    # Try most relevant date formats
    fmts = [
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%d-%m-%Y %H:%M",
        "%d-%m-%Y %I:%M %p",
        "%d/%m/%Y %H:%M",
        "%d/%m/%Y %I:%M %p",
        "%d-%m-%Y%H:%M",
        "%Y-%m-%d%H:%M",
        "%d/%m/%Y%H:%M",
    ]
    for fmt in fmts:
        try:
            dt = datetime.strptime(s, fmt)
            return dt.strftime("%d-%m-%Y")
        except Exception:
            continue
    # Date only formats (no time info)
    date_fmts = [
        "%d-%m-%Y", "%Y-%m-%d", "%d/%m/%Y"
    ]
    for fmt in date_fmts:
        try:
            dt = datetime.strptime(s, fmt)
            return dt.strftime("%d-%m-%Y")
        except Exception:
            continue
    # Fallback: regex for non-standard
    m = re.match(r"(\d{2})-(\d{2})-(\d{4})", s)
    if m:
        return f"{m.group(1)}-{m.group(2)}-{m.group(3)}"
    return s


def convert_emojis_emoticons(text: str) -> str:
    text = clean_text(text)
    for k, v in kaomoji_to_text.items():
        text = text.replace(k, f" {v} ")
    e = emot.core.emot()
    emo = e.emoticons(text)
    for orig, mean in zip(emo["value"], emo["mean"]):
        text = text.replace(orig, f" {mean} ")
    text = emoji.demojize(text)
    return text.strip().lower()


def get_existing_ids(filepath, id_column):
    if os.path.exists(filepath):
        try:
            df = pd.read_csv(filepath)
            return set(df[id_column].astype(str))
        except Exception:
            return set()
    else:
        return set()


def make_comment_id(msg, post_id, comment_content):
    comment_id = (msg.get('data-message-id') or msg.get('id') or '').strip()
    if comment_id and comment_id.lower() not in ['lineardisplaymessageviewwrapper', '']:
        return comment_id
    hash_part = hashlib.sha256(comment_content.encode('utf-8')).hexdigest()[:10]
    return f"{post_id}_c{hash_part}"


def extract_comment_date(msg):
    time_elem = msg.find("time")
    if time_elem:
        if time_elem.has_attr('datetime'):
            return standardize_date(time_elem['datetime'])
        elif time_elem.has_attr('title'):
            return standardize_date(time_elem['title'])
        elif time_elem.text.strip():
            return standardize_date(time_elem.text)
    date_elem = msg.find("span", class_="local-friendly-date")
    if date_elem:
        if date_elem.has_attr('title') and date_elem['title'].strip():
            return standardize_date(date_elem['title'])
        elif date_elem.text.strip():
            return standardize_date(date_elem.text)
    datetime_elem = msg.find("span", class_="DateTime")
    if datetime_elem and datetime_elem.text.strip():
        return standardize_date(datetime_elem.text)
    if msg.has_attr('data-message-timestamp'):
        try:
            ts = int(msg['data-message-timestamp'])
            return datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M')
        except Exception:
            pass
    return ''


def comment_scraping(driver, post_url, post_id, category, max_comments=200, retry=3, polite_delay=1):
    comments = []
    comments_csv = os.path.join(OUTPUT_DIR, f"comments_{category}.csv")
    existing_comment_ids = get_existing_ids(comments_csv, "Comment ID")
    url = post_url
    scraped = 0
    for page in range(1, 100):  # will break on next not found
        if scraped >= max_comments:
            break
        success = False
        for attempt in range(retry):
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "linear-message-list"))
                )
                soup = BeautifulSoup(driver.page_source, "html.parser")
                success = True
                break
            except Exception:
                time.sleep(2)
        if not success:
            break

        section = soup.find("div", class_="linear-message-list")
        if not section:
            break

        msgs = section.find_all("div", class_="lia-message-view-display")
        for msg in msgs:
            if scraped >= max_comments:
                break
            content_elem = msg.find("div", class_="lia-message-body-content")
            comment_content = convert_emojis_emoticons(content_elem.get_text("\n", strip=True)) if content_elem else ""
            comment_id = make_comment_id(msg, post_id, comment_content)
            if comment_id in existing_comment_ids:
                continue
            author_elem = msg.find("a", class_="lia-user-name-link")
            comment_author = clean_text(author_elem.get_text(strip=True)) if author_elem else ""
            comment_date = extract_comment_date(msg)
            support_elem = msg.find("span", {"id": re.compile(r"^kudos-count-")})
            if not support_elem:
                support_elem = msg.find("span", class_="lia-component-kudos-widget-message-kudos-count")
            comment_support = support_elem.text.strip() if support_elem else "0"
            comment_row = {
                "Comment ID": comment_id,
                "Post ID": post_id,
                "Category": category,
                "Comment Author": comment_author,
                "Comment Date": comment_date,
                "Comment Content": comment_content,
                "Comment Support": comment_support,
                "Post URL": post_url
            }
            comments.append(comment_row)
            existing_comment_ids.add(comment_id)
            scraped += 1
        nxt = soup.find("a", rel="next")
        if not nxt or not nxt.get("href"):
            break
        url = "https://forums.beyondblue.org.au" + nxt["href"] if nxt["href"].startswith("/") else nxt["href"]
        time.sleep(polite_delay)
    comments = sorted(comments, key=lambda c: c["Comment Date"])
    return comments


def beyondblue_scraping(tag: str, start_url: str, pages: int = 20, polite_delay=2):
    posts_csv = os.path.join(OUTPUT_DIR, f"posts_{tag}.csv")
    comments_csv = os.path.join(OUTPUT_DIR, f"comments_{tag}.csv")
    existing_post_ids = get_existing_ids(posts_csv, "Post ID")
    existing_comment_ids = get_existing_ids(comments_csv, "Comment ID")
    all_posts = []
    all_comments = []

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.set_page_load_timeout(30)
    url = start_url
    try:
        for p in tqdm(range(1, pages + 1), desc=f"Scraping {tag}"):
            for attempt in range(3):
                try:
                    driver.get(url)
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "custom-message-list"))
                    )
                    soup = BeautifulSoup(driver.page_source, "html.parser")
                    break
                except Exception:
                    if attempt == 2:
                        print(f"Failed to load page {url}")
                        return
                    time.sleep(2)
            container = soup.find("div", class_="custom-message-list all-discussions")
            if not container:
                break
            for art in container.find_all("article"):
                aside = art.find("aside")
                cat_div = aside.find("div", class_="custom-tile-category-content") if aside else None
                post_cat = clean_text(cat_div.find("a").text.strip()) if cat_div and cat_div.find("a") else tag
                # --- DATE ---
                time_el = cat_div.find("time") if cat_div else None
                if time_el and time_el.has_attr('datetime'):
                    raw_date = time_el['datetime']
                elif time_el and time_el.has_attr('title'):
                    raw_date = time_el['title']
                elif time_el:
                    raw_date = time_el.text.strip()
                else:
                    raw_date = ""
                date = standardize_date(raw_date)
                # --- SUPPORT AND REPLIES ---
                support_li = art.find("li", class_="custom-tile-kudos")
                support_span = support_li.find("span") if support_li else None
                post_support = support_span.text.strip() if support_span else "0"

                replies_li = art.find("li", class_="custom-tile-replies")
                replies_b = replies_li.find("b") if replies_li else None
                total_comment_count = replies_b.text.strip() if replies_b else "0"
                # --- POST TITLE/URL/CONTENT ---
                h3 = art.find("h3")
                link_els = h3.find_all("a") if h3 else []
                link_el = link_els[1] if len(link_els) > 1 else (link_els[0] if link_els else None)
                post_link = link_el["href"] if link_el and link_el.has_attr("href") else ""
                post_id = post_link.rstrip("/").split("/")[-1] if post_link else ""
                if not post_id or post_id in existing_post_ids:
                    continue
                full_link = "https://forums.beyondblue.org.au" + post_link if post_link.startswith("/") else post_link
                title = convert_emojis_emoticons(link_el.text.strip()) if link_el else ""
                body = art.find("p", class_="body-text")
                content = convert_emojis_emoticons(body.text.strip()) if body else ""
                auth_div = aside.find("div", class_="custom-tile-author-info") if aside else None
                auth_a = auth_div.find("a") if auth_div else None
                author = clean_text(auth_a.get_text(strip=True)) if auth_a else ""
                post_content = content
                # --- Get real post support from detail page ---
                for attempt in range(3):
                    try:
                        driver.execute_script("window.open('');")
                        driver.switch_to.window(driver.window_handles[1])
                        driver.get(full_link)
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "lia-message-body-content"))
                        )
                        post_soup = BeautifulSoup(driver.page_source, "html.parser")
                        content_elems = post_soup.find_all("div", class_="lia-message-body-content")
                        if content_elems:
                            post_content = "\n".join(
                                [convert_emojis_emoticons(elem.get_text("\n", strip=True)) for elem in content_elems if
                                 elem.get_text(strip=True)])
                        # Get support count from first post in thread
                        support_span_detail = post_soup.find("span", {"id": re.compile(r"^kudos-count-")})
                        if not support_span_detail:
                            support_span_detail = post_soup.find("span",
                                                                 class_="lia-component-kudos-widget-message-kudos-count")
                        if support_span_detail:
                            post_support = support_span_detail.text.strip()
                        # Get post date from detail if missing
                        time_elem = post_soup.find("time")
                        if not date and time_elem:
                            if time_elem.has_attr("datetime"):
                                date = standardize_date(time_elem["datetime"])
                            elif time_elem.has_attr("title"):
                                date = standardize_date(time_elem["title"])
                            else:
                                date = standardize_date(time_elem.text.strip())
                        # Scrape all comments (not for count, but real data)
                        comments = comment_scraping(driver, full_link, post_id, tag, max_comments=200)
                        new_comments = []
                        for c in comments:
                            if c["Comment ID"] not in existing_comment_ids:
                                new_comments.append(c)
                                existing_comment_ids.add(c["Comment ID"])
                        all_comments.extend(new_comments)
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                        break
                    except Exception as e:
                        if attempt == 2:
                            print(f"Failed to extract post page {full_link}: {e}")
                            try:
                                driver.close()
                                driver.switch_to.window(driver.window_handles[0])
                            except:
                                pass
                        else:
                            time.sleep(2)
                post_data = {
                    "Post ID": post_id,
                    "Category": post_cat,
                    "Post Title": title,
                    "Post Author": author,
                    "Post Date": date,
                    "Post Content": post_content,
                    "Support Count": post_support,
                    "Total Number of Comments": total_comment_count,
                    "Post URL": full_link
                }
                all_posts.append(post_data)
                existing_post_ids.add(post_id)
                time.sleep(polite_delay)
            nxt_li = soup.find("li", class_="lia-paging-page-next")
            if nxt_li and nxt_li.find("a"):
                next_href = nxt_li.find("a")["href"]
                url = "https://forums.beyondblue.org.au" + next_href if next_href.startswith("/") else next_href
            else:
                break
            if p % 5 == 0:
                dfp = pd.DataFrame(all_posts)
                dfp.sort_values(by="Post Date", inplace=True)
                dfp.to_csv(posts_csv, index=False)
                dfc = pd.DataFrame(all_comments)
                dfc.sort_values(by="Comment Date", inplace=True)
                dfc.to_csv(comments_csv, index=False)
    finally:
        driver.quit()
    if all_posts:
        dfp = pd.DataFrame(all_posts)
        if os.path.exists(posts_csv):
            dfp_existing = pd.read_csv(posts_csv)
            dfp = pd.concat([dfp_existing, dfp], ignore_index=True)
            dfp.drop_duplicates(subset=["Post ID"], inplace=True)
        dfp.sort_values(by="Post Date", inplace=True)
        dfp.to_csv(posts_csv, index=False)
        print(f"Saved {len(dfp)} posts to {posts_csv}")
    if all_comments:
        dfc = pd.DataFrame(all_comments)
        if os.path.exists(comments_csv):
            dfc_existing = pd.read_csv(comments_csv)
            dfc = pd.concat([dfc_existing, dfc], ignore_index=True)
            dfc.drop_duplicates(subset=["Comment ID"], inplace=True)
        dfc.sort_values(by="Comment Date", inplace=True)
        dfc.to_csv(comments_csv, index=False)
        print(f"Saved {len(dfc)} comments to {comments_csv}")


if __name__ == "__main__":
    mental_health_urls = {
        "depression": "https://forums.beyondblue.org.au/t5/depression/bd-p/c1-sc2-b2?&sort=recent",
        "long_term_support": "https://forums.beyondblue.org.au/t5/long-term-support-over-the/bd-p/c1-sc3-b5?&sort=recent",
        "young_people": "https://forums.beyondblue.org.au/t5/young-people/bd-p/c1-sc4-b1?&sort=recent",
        "Sex_identity": "https://forums.beyondblue.org.au/t5/sexuality-and-gender-identity/bd-p/c1-sc4-b2?&sort=recent",
        "Multiculture": "https://forums.beyondblue.org.au/t5/multicultural-experiences/bd-p/c1-sc4-b3?&sort=recent",
        "Grief_loss": "https://forums.beyondblue.org.au/t5/grief-and-loss/bd-p/c1-sc4-b4?&sort=recent"
    }
    for tag, addr in mental_health_urls.items():
        try:
            beyondblue_scraping(tag, addr, pages=100)
        except Exception as e:
            print(f"Error scraping {tag}: {e}")
            continue

Scraping depression:   4%|▍         | 4/100 [00:11<04:35,  2.87s/it]


Error scraping depression: 'Post Date'


Scraping long_term_support:   4%|▍         | 4/100 [00:08<03:18,  2.07s/it]


Error scraping long_term_support: 'Post Date'


Scraping young_people: 100%|██████████| 100/100 [1:29:55<00:00, 53.96s/it]


Saved 2000 posts to Data\posts_young_people.csv
Saved 1000 comments to Data\comments_young_people.csv


Scraping Sex_identity:  77%|███████▋  | 77/100 [1:17:49<23:14, 60.64s/it]


Saved 1526 posts to Data\posts_Sex_identity.csv
Saved 776 comments to Data\comments_Sex_identity.csv


Scraping Multiculture:  25%|██▌       | 25/100 [29:04<1:27:14, 69.79s/it]


Saved 506 posts to Data\posts_Multiculture.csv
Saved 256 comments to Data\comments_Multiculture.csv


Scraping Grief_loss:  87%|████████▋ | 87/100 [1:25:10<12:43, 58.74s/it]


Saved 1725 posts to Data\posts_Grief_loss.csv
Saved 875 comments to Data\comments_Grief_loss.csv
