### TODO

Core Features
- [x] Handle missing values in structured outputs
- [x] Test other APIs - use env variables in yaml
- [ ] Consider shipping JUST the sculptors as a library
- [ ] Why is it making up fields when I use Deepinfra?

Helper Improvements - maybe ship separately
- [ ] Test helpers for datasources
- [ ] String together steps
- [ ] Clean up visualizer
- [ ] Delete extra files

# Test FULL PIPELINE

In [1]:
%load_ext autoreload
%autoreload 2

import os
import toml
from sculptor import SculptorPipeline

secrets = toml.load("secrets.toml")
os.environ["OPENAI_API_KEY"] = secrets["openai"]["api_key"]
os.environ["DEEPINFRA_API_KEY"] = secrets["deepinfra"]["api_key"]
os.environ["REDDIT_CLIENT_ID"] = secrets["reddit"]["client_id"]
os.environ["REDDIT_CLIENT_SECRET"] = secrets["reddit"]["client_secret"]
os.environ["REDDIT_USER_AGENT"] = secrets["reddit"]["user_agent"]

pipeline = SculptorPipeline.from_config('examples/ai_therapy.yaml')
results = pipeline.process_from_config(n_workers=4)


Step 1/2


Processing items: 100%|██████████| 20/20 [00:13<00:00,  1.52it/s]


Filtered to 6 items

Step 2/2


Processing items: 100%|██████████| 6/6 [00:18<00:00,  3.01s/it]


FileNotFoundError: [Errno 2] No such file or directory: 'output/ai_therapy_out.csv'

# Test on people.csv

In [2]:
%load_ext autoreload
%autoreload 2

import os
import toml

secrets = toml.load("secrets.toml")
os.environ["OPENAI_API_KEY"] = secrets["openai"]["api_key"]
os.environ["DEEPINFRA_API_KEY"] = secrets["deepinfra"]["api_key"]

In [4]:
import pandas as pd
from sculptor import Sculptor
from sculptor import SculptorPipeline
from helpers.data_sources import CSVDataSource

# 1. Create our sculptors
relevance_sculptor = Sculptor(
    schema={
        "is_valid_sample": {"type": bool, "description": "True only if this text contains information about a person."},
        "explanation": {"type": str, "description": "Explain why this sample is or is not valid."}
    },
    instructions="Determine if the following text contains information about a person.",
    template="Text: {text}"
)

demo_sculptor = Sculptor.from_config("examples/demosculpt.yaml")
demo_sculptor.add("first_letter", str, "First letter of the persons first name")

# 2. Create and configure the pipeline
pipeline = (SculptorPipeline()
    .add(relevance_sculptor, lambda x: x['is_valid_sample'])  # Filter on is_valid_sample
    .add(demo_sculptor))

# 3. Load and process the data
csv_source = CSVDataSource("examples/people.csv")
df = csv_source.get_data()

# 4. Run the pipeline (will preserve all columns by default)
results = pipeline.process(
    df.to_dict('records'),
    n_workers=4,  # Parallel processing
    show_progress=True
)

# 5. Convert back to DataFrame
extracted_df = pd.DataFrame(results)

# Display results
extracted_df.head(20)


Step 1/2


Processing items: 100%|██████████| 11/11 [00:02<00:00,  4.89it/s]


Filtered to 8 items

Step 2/2


Processing items: 100%|██████████| 8/8 [00:02<00:00,  3.17it/s]


Unnamed: 0,text,is_valid_sample,explanation,name,age,city,occupation,interests,is_married,num_children,net_worth,first_letter
0,"Alice is 30 years old, lives in New York, and ...",True,The text contains detailed information about a...,Alice,30,New York,software engineer,"[hiking, reading]",False,1.0,1200000.0,A
1,"Bob, 25, is a teacher in London. He's an avid ...",True,The text contains information about a person n...,Bob,25,London,teacher,[cycling],True,2.0,500000.0,B
2,Charlie is a 40-year-old data scientist from C...,True,The text contains information about a person n...,Charlie,40,Chicago,data scientist,"[skiing, cooking, photography]",True,1.0,800000.0,C
3,"David, a 35-year-old architect, resides in San...",True,The text contains information about a person n...,David,35,San Francisco,architect,[rock climbing],False,,1500000.0,D
4,Emily is a 28-year-old nurse in Seattle. She l...,True,The text contains information about a person n...,Emily,28,Seattle,nurse,"[traveling, trying new foods]",False,,400000.0,E
5,Frank is a 50-year-old lawyer living in Boston...,True,The text contains information about a person n...,Frank,50,Boston,lawyer,"[golfing, fishing]",True,3.0,3.2,F
6,"Grace, a 22-year-old student in Austin, is pas...",True,The text contains information about a person n...,Grace,22,Austin,student,"[music, volunteering]",False,,,G
7,"Katrina, a 28-year-old art expert in NYC.",True,The text contains information about a person n...,Katrina,28,NYC,art expert,[],False,,,K


# Test full AI Therapy Pipeline

In [12]:
%load_ext autoreload
%autoreload 2

import os
import toml
import praw
import pandas as pd
from sculptor import Sculptor
from sculptor import SculptorPipeline
from helpers.data_sources import RedditDataSource

secrets = toml.load("secrets.toml")
os.environ["OPENAI_API_KEY"] = secrets["openai"]["api_key"]
os.environ["DEEPINFRA_API_KEY"] = secrets["deepinfra"]["api_key"]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
reddit = praw.Reddit(
            client_id=secrets["reddit"]["client_id"],
            client_secret=secrets["reddit"]["client_secret"],
            user_agent=secrets["reddit"]["user_agent"]
        )

reddit_src1 = RedditDataSource(
    reddit_client=reddit,
    query="(AI OR chatbot OR GPT) AND (mental health OR therapy)",
    include_comments=False,
    limit=10
)
reddit_src2 = RedditDataSource(
    reddit_client=reddit,
    query="(AI OR chatbot OR GPT)",
    subreddits=["ADHD", "anxiety"],
    include_comments=False,
    limit=10
)

dfs = [reddit_src1.get_data(), reddit_src2.get_data()]
df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset='id')

In [13]:
pipeline = SculptorPipeline.from_config('examples/ai_therapy.yaml')
results = pipeline.process(df)


Step 1/2


Processing items: 100%|██████████| 20/20 [01:16<00:00,  3.81s/it]


Filtered to 6 items

Step 2/2


Processing items: 100%|██████████| 6/6 [00:55<00:00,  9.32s/it]


In [15]:
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,id,text,title,context_text,url,subreddit,score,created_utc,is_comment,comment_id,...,relevant_sample_explanation,sentiment,benefits,downsides,use_cases,conditions,seeing_provider,previous_provider,provider_problems,analysis_notes
0,1fajq7r_post,I've been desperately trying to figure out wha...,Chat GPT Transforms My Mental Health In 2 Weeks,,https://reddit.com/r/ChatGPT/comments/1fajq7r/...,ChatGPT,775,2024-09-06 16:40:03,False,,...,The user explicitly describes their personal e...,9,"[improved self-esteem, increased confidence, b...",[none mentioned],"[CBT, journaling, venting, goal setting]",[low self-esteem],False,True,[slow progress],The user credits Chat GPT with transforming th...
1,1gmmujy_post,Hey!\n\nYou probably heard about people using ...,Using ChatGPT as a tool to improve your mental...,,https://reddit.com/r/DecidingToBeBetter/commen...,DecidingToBeBetter,756,2024-11-08 16:40:52,False,,...,The user explicitly describes their personal e...,9,"[new perspectives, valuable advice, progress i...",[initial skepticism],"[venting, problem-solving]","[broken mind, being stuck]",False,True,[previous providers didn't work for the user],The user is surprised by the effectiveness of ...
2,1bhxtuf_post,"For me specifically, I am looking for somethin...",What are the best ai assistants or ai tools fo...,,https://reddit.com/r/ADHD/comments/1bhxtuf/wha...,ADHD,24,2024-03-18 18:23:40,False,,...,The user shares a personal experience of strug...,8,[potential for improved executive functioning ...,[ineffective memory in chat GPT],"[personal assistant, task reminders, prioritiz...",[ADHD],False,False,[none mentioned],The user is seeking an AI assistant to support...
3,1hjkwj8_post,I feel like my RSD plays a huge role in my rel...,ChatGPT Hack,,https://reddit.com/r/ADHD/comments/1hjkwj8/cha...,ADHD,11,2024-12-21 22:31:38,False,,...,The user explicitly describes their personal e...,9,"[emotional regulation, safe space for venting,...",[none mentioned],"[venting, journaling, conflict resolution]","[ADHD, RSD (Rejection Sensitive Dysphoria)]",False,False,[none mentioned],The user discovered a creative hack using Chat...
4,1h2iv07_post,This is from a comment I made on some tech pos...,Using “AI” to organize my scatter Brain thoughts,,https://reddit.com/r/ADHD/comments/1h2iv07/usi...,ADHD,0,2024-11-29 10:22:13,False,,...,The user explicitly describes their personal e...,9,"[Improved organization, Increased productivity...",[None mentioned],"[Organizing thoughts, Breaking down ideas into...",[ADHD],False,False,[None mentioned],The user utilizes chatGPT to manage their ADHD...


# Old

In [7]:
CONFIG_TAG = "ai_therapy"
CONFIG_FILE = f"configs/{CONFIG_TAG}.toml"

In [8]:
import toml
import pandas as pd
from openai import OpenAI
from helpers.datasource import RedditDataSource, HackerNewsDataSource

from helpers.processor import Processor, FILTER_PROMPT
import helpers.visualizer as visualizer

secrets = toml.load("secrets.toml")
config = toml.load(CONFIG_FILE)

ANALYSIS_USE_CASE = config["use_case"]
FIELDS = config["fields"]

DEEPINFRA_CONFIG = secrets.get("deepinfra", {})
DEEPINFRA_LLM = OpenAI(
  api_key=DEEPINFRA_CONFIG.get("api_key"),
  base_url=DEEPINFRA_CONFIG.get("base_url"), 
)

OPENAI_CONFIG = secrets.get("openai", {})
OPENAI_LLM = OpenAI(
  api_key=OPENAI_CONFIG.get("api_key"),
  base_url=OPENAI_CONFIG.get("base_url"), 
)

processor = Processor(
    use_case_description=ANALYSIS_USE_CASE,
    filter_prompt=FILTER_PROMPT,
    extraction_schema=FIELDS,
    filter_llm_client=DEEPINFRA_LLM,
    filter_model="meta-llama/Llama-3.3-70B-Instruct-Turbo",  # Cheaper model for filtering
    extract_llm_client=DEEPINFRA_LLM,
    extract_model="meta-llama/Meta-Llama-3.1-405B-Instruct",  # More accurate model for extraction
)

In [3]:
data_sources = []

for ds_conf in config["data_sources"]:
    ds_type = ds_conf.pop("type", None)
    if ds_type == "reddit":
        data_sources.append(RedditDataSource(**ds_conf))
    elif ds_type == "hackernews":
        data_sources.append(HackerNewsDataSource(**ds_conf))
    else:
        print(f"Unknown data source type: {ds_type}")

In [4]:
dfs = [source.get_data() for source in data_sources]
df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset='id')

In [None]:
# df_filtered = processor.filter_data(df.sample(n=100, random_state=42))  # For testing with a small random sample
df_filtered = processor.filter_data(df)  # 1st level filtering
df_extracted = processor.extract_fields(df_filtered)  # Extraction of structured data
samples = df_extracted[df_extracted['relevant_sample'] == True]  # 2nd level filtering
print(f"Samples: initial={len(df)}, after 1st filter={len(df_filtered[df_filtered['is_relevant']==True])}, final={len(samples)}")

In [None]:
# # Backup dataframes to output files
# df.to_json(f"out/{CONFIG_TAG}.json", orient='records', date_format='iso')
# df_filtered.to_json(f"out/{CONFIG_TAG}-filtered.json", orient='records', date_format='iso') 
# samples.to_json(f"out/{CONFIG_TAG}-samples.json", orient='records', date_format='iso')

In [None]:
# Simple plots are created automatically by vizualizer
viz = visualizer.Visualization(samples, FIELDS)
viz.plot_all_fields(show_examples=True)
viz.show_samples(n=3, extra_fields=['sentiment','use_cases'])
viz.plot_by_time('created_utc', "Posts Over Time")
# viz.plot_group_comparison('subreddit', 'sentiment', agg='mean')
# viz.plot_correlation(['sentiment'])