In [4]:
import toml
import pandas as pd
from openai import OpenAI
from datasource import RedditDataSource
from processor import Processor

# Read credentials from the local TOML secrets file and pull out the OpenAI key.
secrets = toml.load("secrets.toml")
openai_key = secrets["OPEN_AI_KEY"]

# One client object per pipeline stage (filter first, then extract); both are
# created with the same API key.
filter_llm_client, extract_llm_client = (
    OpenAI(api_key=openai_key) for _ in range(2)
)

def _scalar_field(kind, description):
    """Return a fresh schema entry for a single-valued field of type `kind`."""
    return {"type": kind, "description": description}


def _string_list_field(description):
    """Return a fresh schema entry for a field holding an array of strings."""
    return {"type": "array", "items": {"type": "string"}, "description": description}


# Free-text description of the analysis goal (leading and trailing newlines are
# part of the value).
ANALYSIS_USE_CASE = (
    "\n"
    "We are analyzing Reddit posts to understand how people are using AI and chatbots for mental health...\n"
    "(Your original text as needed)\n"
)

# Extraction schema: field name -> type/description entry, in extraction order.
FIELDS = {
    "relevant_sample": _scalar_field("boolean", "..."),
    "relevant_sample_explanation": _scalar_field("string", "..."),
    "sentiment": _scalar_field("integer", "..."),
    "benefits": _string_list_field("..."),
    "downsides": _string_list_field("..."),
    "use_cases": _string_list_field("..."),
    "conditions": _string_list_field("..."),
    "seeing_provider": _scalar_field("boolean", "..."),
    "previous_provider": _scalar_field("boolean", "..."),
    "provider_problems": _string_list_field("..."),
    "fields_explanation": _scalar_field("string", "..."),
}

# Prompt used for the relevance-filtering step.
FILTER_PROMPT = (
    "Is this text relevant to the described use case? "
    "Answer 'Yes' or 'No' and explain briefly."
)


In [5]:
# Model identifiers for the two LLM stages, named so they can be tuned in one
# place. Both stages currently use the same model; point EXTRACT_MODEL at a
# stronger model if extraction accuracy should be traded against cost.
FILTER_MODEL = "gpt-4o-mini"
EXTRACT_MODEL = "gpt-4o-mini"

# Build the processor from the use-case description, the filter prompt, the
# extraction schema, and one LLM client + model name per stage.
processor = Processor(
    use_case_description=ANALYSIS_USE_CASE,
    filter_prompt=FILTER_PROMPT,
    extraction_schema=FIELDS,
    filter_llm_client=filter_llm_client,
    extract_llm_client=extract_llm_client,
    filter_model=FILTER_MODEL,
    extract_model=EXTRACT_MODEL,
)

In [6]:
# Build a Reddit data source for the AI / mental-health search query
# (limit=20, comments included) and load its data into `df`.
#
# NOTE(review): this call fails as written. The recorded output of this cell is
#   TypeError: RedditDataSource.__init__() missing 2 required positional
#   arguments: 'subreddits' and 'reddit_client'
# Both arguments have to be supplied before the cells below can run. The
# commented-out example further down passes `subreddits` as a list of names
# (e.g. ["ADHD", "anxiety"]). How `reddit_client` is created is not shown in
# this notebook — TODO confirm the expected type against datasource.py.
data_source = RedditDataSource(
    query="(AI OR chatbot OR GPT) AND (mental health OR therapy)",
    limit=20,
    include_comments=True
)
# Presumably returns a pandas DataFrame (it is later passed to pd.concat in the
# commented-out example) — TODO confirm.
df = data_source.get_data()

TypeError: RedditDataSource.__init__() missing 2 required positional arguments: 'subreddits' and 'reddit_client'

In [None]:



# NOTE(review): `df` is expected to come from the data-loading cell above. The
# TypeError recorded there shows `subreddits` is a required argument, so leaving
# it out does not fall back to 'all' with the current RedditDataSource signature.


# If you wanted multiple sources:
# data_source2 = RedditDataSource(query="(AI OR chatbot OR GPT)", limit=10, include_comments=False, subreddits=["ADHD","anxiety"])
# df2 = data_source2.get_data()
# df = pd.concat([df, df2], ignore_index=True).drop_duplicates(subset='id')

# Filter data: run the loaded posts through the processor's filter step.
df_filtered = processor.filter_data(df)

# Extract fields from the output of the filter step.
df_extracted = processor.extract_fields(df_filtered)

# Preview the first rows of the extraction result as the cell output.
df_extracted.head()

# After this, you can visualize, group, etc. outside of the processor.
# For example, to plot sentiment distribution:
# import plotly.express as px
# df_relevant = df_extracted[df_extracted["is_relevant"] == True]
# if "sentiment" in df_relevant.columns and df_relevant["sentiment"].notnull().any():
#     fig = px.histogram(df_relevant, x="sentiment", title="Sentiment distribution")
#     fig.show()
