In [None]:
!pip install praw nltk



# **Importing Libraries**

In [None]:
import praw
import pandas as pd
import re
import nltk
import json
from datetime import datetime
import warnings

# Fetch the NLTK resources needed for tokenization and stopword filtering.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
# NOTE(review): the 'ignore' action is installed without a category or module
# filter, so it hides every Python warning for the rest of the session —
# confirm that this is intended.
warnings.filterwarnings('ignore')

# Both names are used by preprocess_text.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# **Reddit API**

In [None]:
# Initialize Reddit API client
def initialize_reddit_client():
    """Build the PRAW client used for scraping.

    Credentials are taken from the ``REDDIT_CLIENT_ID``,
    ``REDDIT_CLIENT_SECRET`` and ``REDDIT_USER_AGENT`` environment variables so
    that real secrets never have to be typed into (and saved with) the
    notebook. Any variable that is not set falls back to the ``"#"``
    placeholder.

    Returns:
        praw.Reddit: client constructed from the resolved credentials.
    """
    import os  # local import keeps this cell self-contained

    placeholder = "#"
    reddit = praw.Reddit(
        client_id=os.environ.get("REDDIT_CLIENT_ID", placeholder),
        client_secret=os.environ.get("REDDIT_CLIENT_SECRET", placeholder),
        user_agent=os.environ.get("REDDIT_USER_AGENT", placeholder),
    )
    return reddit


# **Preprocessing**

In [None]:
# Keyword phrases used to decide whether a post is mental-health related
MENTAL_HEALTH_KEYWORDS = [
    # mood and anxiety
    "depressed",
    "depression",
    "anxiety",
    "anxious",
    # suicide and self-harm
    "suicidal",
    "suicide",
    "self-harm",
    # addiction and substance use
    "addiction",
    "substance abuse",
    "alcohol abuse",
    "drug abuse",
    # general distress and support
    "overwhelmed",
    "hopeless",
    "therapy",
    "mental health",
    "panic attack",
]

# Function to check if post contains any keywords
def contains_keywords(text, keywords):
    """Report whether ``text`` mentions at least one entry of ``keywords``.

    The comparison is a case-insensitive substring test, so a keyword also
    matches when it appears inside a longer word.

    Args:
        text: String to search, or ``None``.
        keywords: Iterable of keyword strings.

    Returns:
        bool: ``True`` on the first keyword found in ``text``; ``False`` when
        none is found or when ``text`` is ``None``.
    """
    if text is None:
        return False
    haystack = text.lower()
    return any(needle.lower() in haystack for needle in keywords)

In [None]:
# Emoji code points stripped by preprocess_text; compiled once, not per call.
# The last three entries replace a single "\U000024C2-\U0001F251" range, which
# also spanned CJK, kana, Hangul and other letters and deleted ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # circled M
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic symbols
    "]+",
    flags=re.UNICODE,
)

# Filled on first successful use by _english_stop_words().
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stopwords as a set, loading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Clean a post's text and drop English stopwords.

    Steps, in order: remove ``http...`` URLs, remove emoji, delete every
    character that is neither a word character nor whitespace, collapse runs
    of whitespace, then tokenize and filter out NLTK's English stopwords.
    Punctuation is deleted without inserting a space, so ``"self-harm"``
    becomes ``"selfharm"``.

    Args:
        text: Raw text, or ``None``.

    Returns:
        str: The remaining tokens joined by single spaces. ``""`` when
        ``text`` is ``None``. If tokenization or stopword loading raises, the
        error is printed and the cleaned, unfiltered text is returned.
    """
    if text is None:
        return ""

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove emoji
    text = _EMOJI_PATTERN.sub('', text)

    # Remove special characters and keep only alphanumeric and spaces
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    try:
        # Remove stopwords
        stop_words = _english_stop_words()
        word_tokens = word_tokenize(text)
        kept = [word for word in word_tokens if word.lower() not in stop_words]
        return " ".join(kept)
    except Exception as e:
        # Best-effort: a missing NLTK resource must not abort the whole scrape.
        print(f"Error in text preprocessing: {e}")
        return text

# **Scraping Reddit**

In [None]:
# Function to scrape subreddits
def scrape_mental_health_posts(subreddits, post_limit=100, time_filter="month"):
    """Collect keyword-matching top posts from each subreddit.

    For every name in ``subreddits`` the top posts for ``time_filter`` are
    fetched (at most ``post_limit`` per subreddit). A post is kept when its
    title or body contains one of ``MENTAL_HEALTH_KEYWORDS``.

    Args:
        subreddits: Iterable of subreddit names.
        post_limit: Maximum number of top posts requested per subreddit.
        time_filter: Time window passed to ``subreddit.top``.

    Returns:
        list[dict]: One dict per kept post with the keys ``post_id``,
        ``subreddit``, ``timestamp`` (UTC, ``"%Y-%m-%d %H:%M:%S"``),
        ``title``, ``content``, ``likes``, ``comments``, ``url`` and
        ``preprocessed_content``.

    An exception while reading a subreddit is printed and that subreddit is
    abandoned; posts collected before the failure are kept.
    """
    from datetime import timezone  # the file-level import only brings in `datetime`

    reddit = initialize_reddit_client()
    all_posts = []

    for subreddit_name in subreddits:
        try:
            subreddit = reddit.subreddit(subreddit_name)

            # Get top posts
            for post in subreddit.top(time_filter=time_filter, limit=post_limit):
                title = getattr(post, 'title', "")
                selftext = getattr(post, 'selftext', "")
                full_text = title + " " + selftext

                if not contains_keywords(full_text, MENTAL_HEALTH_KEYWORDS):
                    continue

                # created_utc is a UTC epoch; convert with an explicit UTC zone
                # so the string does not depend on the machine's local timezone.
                created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

                all_posts.append({
                    "post_id": post.id,
                    "subreddit": subreddit_name,
                    "timestamp": created.strftime("%Y-%m-%d %H:%M:%S"),
                    "title": title,
                    "content": selftext,
                    "likes": post.score,
                    "comments": post.num_comments,
                    "url": f"https://www.reddit.com{post.permalink}",
                    "preprocessed_content": preprocess_text(full_text),
                })
        except Exception as e:
            print(f"Error scraping subreddit {subreddit_name}: {e}")

    return all_posts


# **Main Function**

In [None]:
# Main function
def main():
    """Scrape the mental-health subreddits and save the matches as CSV.

    Requests up to 200 top posts per subreddit, writes the kept posts to
    ``mental_health_reddit_posts.csv`` and prints how many were saved. When
    nothing was scraped the file still gets a header row, so it can be read
    back with ``pd.read_csv``.
    """
    # List of mental health related subreddits
    mental_health_subreddits = [
        "depression", "anxiety", "mentalhealth", "addiction",
        "suicidewatch", "alcoholism", "stopdrinking", "leaves",
        "depression_help", "anxietyhelp", "mentalhealthsupport"
    ]

    # Scrape posts
    posts = scrape_mental_health_posts(mental_health_subreddits, post_limit=200)

    # Create DataFrame
    if posts:
        df = pd.DataFrame(posts)
    else:
        # With no rows pandas cannot infer the columns, which would produce an
        # empty file; name them so the CSV keeps its header.
        df = pd.DataFrame(columns=[
            "post_id", "subreddit", "timestamp", "title", "content",
            "likes", "comments", "url", "preprocessed_content",
        ])

    # Save to CSV
    csv_filename = "mental_health_reddit_posts.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Saved {len(posts)} posts to {csv_filename}")




In [None]:
# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Saved 1102 posts to mental_health_reddit_posts.csv
