In [1]:
# Question handed to the Processor as `filter_prompt`: asks for a Yes/No
# relevance verdict plus a brief explanation.
FILTER_PROMPT = (
    "Is this text relevant to the described use case? "
    "Answer 'Yes' or 'No' and explain briefly."
)

# Natural-language description of the study scope, handed to the Processor as
# `use_case_description`. It spells out which posts count as relevant
# (first-hand use of AI tools for mental health, coaching, or emotional
# support) and instructs the model to use only keywords that fit the schema.
# The text is part of the prompt, so any edit to it changes model behaviour.
ANALYSIS_USE_CASE = """
We are analyzing Reddit posts to understand how people are using AI and chatbots for mental health, coaching, or emotional support.
Specifically, we want to identify posts where users share their personal experiences using AI tools for:
- Managing mental health conditions (anxiety, depression, ADHD, OCD, PTSD, trauma, etc.)
- Emotional support and wellbeing
- Therapy supplements or alternatives
- Wellness coaching and goal setting
- Help focusing, goal setting, managing stress, overcoming obstacles, etc.
- Other similar use cases for AI in mental health

The post should include first-hand experience using AI tools, not just general discussion about AI in mental health.
This does NOT need to be the main focus of the post, but it should clearly mention using AI for the use case described.
We want to extract structured data about their experiences, including benefits, challenges, and specific use cases.
Do NOT make stuff up.  ONLY use keywords that accurately fit what the schema describes. 
A keyword that applies to the post generally but not specifically to what is asked for by the schema should not be used.
"""

# Extraction schema handed to the Processor as `extraction_schema` and to the
# visualizer. Each key is an output field; each value gives a JSON-schema-style
# "type" (plus "items" for arrays) and a "description" that tells the model
# what to put in that field. The descriptions are prompt text: changing them
# changes what the model extracts.
FIELDS = {
    # --- Second-level relevance check (used later to select `samples`) ---
    "relevant_sample": {
        "type": "boolean",
        "description": "Boolean indicating if text describes personal experience using AI for the use case described in the prompt"
    },
    "relevant_sample_explanation": {
        "type": "string",
        "description": "Explanation of why the sample was classified as relevant or not relevant"
    },
    # --- Attitude towards the AI interaction itself, not the post as a whole ---
    "sentiment": {
        "type": "integer",
        "description": "Integer 1-10 indicating sentiment TOWARDS using AI for mental health (10 most positive).  This is NOT sentiment of the post overall, just sentiment towards the interaction with AI."
    },
    # --- Free-form keyword lists; the "e.g." entries are examples, not a closed vocabulary ---
    "benefits": {
        "type": "array",
        "items": {"type": "string"},
        # NOTE(review): "non_judgemental" here vs. "judgmental" in
        # provider_problems use different spellings — confirm whether
        # downstream keyword grouping cares.
        "description": "List of keywords relating to perceived benefits of using the AI, e.g.: non_judgemental, on_demand, affordable, accessible, anonymous, consistent, supportive, patient"
    },
    "downsides": {
        "type": "array",
        "items": {"type": "string"},
        "description": "List of keywords relating to downsides of using the AI, e.g.: repetitive, robotic, shallow, unreliable, addictive, avoidant, limited"
    },
    "use_cases": {
        "type": "array",
        "items": {"type": "string"},
        "description": "List of keywords relating to how AI is used, e.g.: reflection, venting, self_talk, planning, CBT, journaling, motivation, reminders, emotional_support"
    },
    "conditions": {
        "type": "array",
        "items": {"type": "string"},
        "description": "List of keywords describing conditions being addressed, e.g.: ADHD, depression, anxiety, addiction, OCD, PTSD, bipolar, eating_disorder"
    },
    # --- Relationship with human mental-health providers ---
    "seeing_provider": {
        "type": "boolean",
        "description": "Boolean indicating if subject indicates they are CURRENTLY seeing a therapist or mental health provider"
    },
    "previous_provider": {
        "type": "boolean",
        "description": "Boolean indicating if subject indicates they have EVER seen a therapist or mental health provider"
    },
    "provider_problems": {
        "type": "array",
        "items": {"type": "string"},
        "description": "List of keywords relating to perceived issues with HUMAN PROVIDERS, e.g.: expensive, unavailable, inaccessible, scheduling, inconsistent, judgmental"
    },
    # --- Model's own justification for the values above ---
    "fields_explanation": {
        "type": "string",
        "description": "Concise but thorough explanation of your reasoning for each field in the schema (except for relevant_sample and relevant_sample_explanation)"
    },
}

In [9]:
import toml
import pandas as pd
from openai import OpenAI
from datasource import RedditDataSource
from processor import Processor
import visualizer
from importlib import reload

# API keys and endpoints live in a local TOML file, not in the notebook.
secrets = toml.load("secrets.toml")

# Names of the tables inside secrets.toml, one per LLM provider.
DEEPINFRA = "deepinfra"
OPENAI = "openai"


def _client_for(provider):
    """Build an OpenAI-compatible client from the ``provider`` table of ``secrets``.

    Lookups are lenient: a missing table or a missing key results in ``None``
    being passed for that argument instead of raising here.
    """
    settings = secrets.get(provider, {})
    return OpenAI(
        api_key=settings.get("api_key"),
        base_url=settings.get("base_url"),
    )


FILTER_LLM = _client_for(DEEPINFRA)   # client used for the first-pass filter
EXTRACT_LLM = _client_for(OPENAI)     # client used for structured extraction

processor = Processor(
    # What we are looking for and the schema to fill in.
    use_case_description=ANALYSIS_USE_CASE,
    extraction_schema=FIELDS,
    # Stage 1: relevance filter on the cheaper model.
    filter_prompt=FILTER_PROMPT,
    filter_llm_client=FILTER_LLM,
    filter_model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
    # Stage 2: field extraction on the more accurate model.
    extract_llm_client=EXTRACT_LLM,
    extract_model="gpt-4o-mini",
)

In [10]:
data_sources = [
    # AI terms combined with mental-health terms; no subreddit list is given
    # and comments are excluded.
    RedditDataSource(
        query="(AI OR chatbot OR GPT) AND (mental health OR therapy)",
        include_comments=False,
        limit=10,
    ),
    # AI terms alone, restricted to two subreddits, with comments included.
    RedditDataSource(
        query="(AI OR chatbot OR GPT)",
        include_comments=True,
        subreddits=["ADHD", "anxiety"],
        limit=10,
    ),
]

# One DataFrame per source, stacked into a single frame; rows sharing an 'id'
# are collapsed to the first occurrence.
dfs = [src.get_data() for src in data_sources]
df = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset="id")
)
# For quick test runs the frame can be subsampled here, e.g.
# df.sample(n=<limit>, random_state=42); no LIMIT constant is defined in the
# visible cells, so this is left as a note rather than live code.

In [None]:
# Stage 1: relevance filter. The result carries an 'is_relevant' column.
# NOTE(review): the frame appears to keep rows that failed the filter (the
# count below selects on is_relevant) — confirm whether extract_fields skips
# those rows itself.
df_filtered = processor.filter_data(df)

# Stage 2: structured extraction according to FIELDS.
df_extracted = processor.extract_fields(df_filtered)

# Stage 3: keep only rows the extraction model itself marked as relevant.
samples = df_extracted.loc[df_extracted["relevant_sample"].eq(True)]

# Row counts at each stage of the funnel.
first_pass_count = int(df_filtered["is_relevant"].eq(True).sum())
print(
    "Samples: "
    f"initial={len(df)}, "
    f"after 1st filter={first_pass_count}, "
    f"final={len(samples)}"
)

In [None]:

# Re-import the visualizer module so that edits made to it since the kernel
# started are picked up before the plots are built.
reload(visualizer)
# Build the visualization helper from the final samples and the extraction schema.
viz = visualizer.Visualization(samples, FIELDS)
# The calls below render their output in order; what each one draws is defined
# in the visualizer module (not visible here), so the notes are hedged.
viz.plot_all_fields(show_examples=True)  # presumably one plot per schema field — confirm
viz.show_samples(n=3, extra_fields=['sentiment','use_cases'])  # presumably prints 3 example rows — confirm
viz.plot_by_time('created_utc', "Posts Over Time")  # assumes samples has a 'created_utc' column
viz.plot_group_comparison('subreddit', 'sentiment', agg='mean')  # assumes samples has a 'subreddit' column
viz.plot_correlation(['sentiment'])  

In [None]:
# Show the final set of relevant samples as this cell's output.
samples