### TODO

Core Features
- Handle missing values in structured outputs
- Test other APIs - use environment variables in the YAML config
- Consider shipping JUST the sculptors as a library
- Why is it making up fields when I use DeepInfra?

Helper Improvements - maybe ship separately
- Test helpers for datasources
- String together steps
- Clean up visualizer
- Delete extra files

In [3]:
%load_ext autoreload
%autoreload 2

import os
import toml

secrets = toml.load("secrets.toml")
os.environ["OPENAI_API_KEY"] = secrets["openai"]["api_key"]
os.environ["DEEPINFRA_API_KEY"] = secrets["deepinfra"]["api_key"]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Minimal usage sketch: build a pipeline from a YAML config and run it.
# NOTE(review): `your_data` is a placeholder that this cell does not define;
# bind it to your input records before running.
from sculptor import Sculptor, SculptorPipeline

pipeline = SculptorPipeline.from_config('pipeline_config.yaml')
results = pipeline.process(your_data)

In [2]:
import pandas as pd
from sculptor import Sculptor, SculptorPipeline
from helpers.data_sources import CSVDataSource

# Step 1: define the two sculptors.
# The relevance sculptor is declared inline with a two-field schema.
relevance_sculptor = Sculptor(
    schema={
        "is_valid_sample": {
            "type": bool,
            "description": "True only if this text contains demographic information.",
        },
        "explanation": {
            "type": str,
            "description": "Explain why this sample is or is not valid.",
        },
    },
    instructions="Determine if the following text contains demographic information about a person.",
    template="Text: {text}",
)

# The demographic sculptor comes from a YAML config and gets one extra field.
demo_sculptor = Sculptor.from_config("sample_data/demosculpt.yaml")
demo_sculptor.add("first_letter", str, "First letter of the persons first name")

# Step 2: assemble the pipeline one stage at a time.
# The callable passed with the relevance sculptor reads `is_valid_sample`;
# in the saved run, 11 items enter step 1 and 8 reach step 2.
pipeline = SculptorPipeline()
pipeline = pipeline.add(relevance_sculptor, lambda x: x['is_valid_sample'])
pipeline = pipeline.add(demo_sculptor)

# Step 3: read the sample CSV into a DataFrame.
csv_source = CSVDataSource("sample_data/people.csv")
df = csv_source.get_data()

# Step 4: run the pipeline over the rows as a list of dicts, using 4 workers
# and a progress bar. The saved output shows the input `text` column kept
# alongside the extracted fields.
results = pipeline.process(
    df.to_dict(orient='records'),
    n_workers=4,
    show_progress=True,
)

# Step 5: back to a DataFrame for display.
extracted_df = pd.DataFrame(results)

extracted_df.head(20)


Step 1/2


Processing items: 100%|██████████| 11/11 [00:03<00:00,  3.66it/s]



Step 2/2


Processing items: 100%|██████████| 8/8 [00:02<00:00,  3.25it/s]


Unnamed: 0,text,is_valid_sample,explanation,name,age,city,occupation,interests,is_married,num_children,net_worth,first_letter
0,"Alice is 30 years old, lives in New York, and ...",True,The text contains demographic information such...,Alice,30,New York,software engineer,"[hiking, reading]",False,1,1200000,A
1,"Bob, 25, is a teacher in London. He's an avid ...",True,The text contains demographic information such...,Bob,25,London,teacher,[cycling],True,2,500000,B
2,Charlie is a 40-year-old data scientist from C...,True,The text contains demographic information such...,Charlie,40,Chicago,data scientist,"[skiing, cooking, photography]",True,1,800000,C
3,"David, a 35-year-old architect, resides in San...",True,The text contains demographic information such...,David,35,San Francisco,architect,[rock climbing],False,0,1500000,D
4,Emily is a 28-year-old nurse in Seattle. She l...,True,The text contains demographic information such...,Emily,28,Seattle,nurse,"[traveling, trying new foods]",False,0,400000,E
5,Frank is a 50-year-old lawyer living in Boston...,True,The text contains demographic information such...,Frank,50,Boston,lawyer,"[golfing, fishing]",True,3,3200000,F
6,"Grace, a 22-year-old student in Austin, is pas...",True,The text contains demographic information such...,Grace,22,Austin,student,"[music, volunteering]",False,0,0,G
7,"Katrina, a 28-year-old art expert in NYC.",True,The text contains demographic information such...,Katrina,28,NYC,art expert,[],False,0,0,K


In [31]:
import pandas as pd
from sculptor import Sculptor
from helpers.data_sources import CSVDataSource

# Step 1: relevance sculptor, declared inline through the `schema` argument.
relevance_sculptor = Sculptor(
    schema={
        "is_valid_sample": {
            "type": bool,
            "description": "True only if this text contains demographic information.",
        },
        "explanation": {
            "type": str,
            "description": "Explain why this sample is or is not valid.",
        },
    },
    instructions="Determine if the following text contains demographic information about a person.",
    template="Text: {text}",
)

# Step 2: extraction sculptor, loaded from YAML and extended with add().
demo_sculptor = Sculptor.from_config("sample_data/demosculpt.yaml")
demo_sculptor.add("first_letter", str, "First letter of the persons first name")

csv_source = CSVDataSource("sample_data/people.csv")
df = csv_source.get_data()

# Step 3: run the relevance sculptor on every row, lift its two fields into
# their own columns, then remove the temporary column holding the raw result.
df["relevance_data"] = df.apply(relevance_sculptor.sculpt, axis=1)
df["is_valid_sample"] = df["relevance_data"].map(lambda item: item.get("is_valid_sample"))
df["explanation"] = df["relevance_data"].map(lambda item: item.get("explanation"))
del df["relevance_data"]

# Step 4: keep only rows flagged as valid. Comparing against True (rather than
# using the column as a mask directly) also treats missing flags as "not valid".
relevant_df = df.loc[df["is_valid_sample"].eq(True)].copy()

# Step 5: run the extraction sculptor on the remaining rows.
relevant_df["extracted_data"] = relevant_df.apply(demo_sculptor.sculpt, axis=1)

# Step 6: spread each extracted result across columns, attach them to the
# filtered rows, and discard the bookkeeping columns.
extracted_df = pd.concat(
    [relevant_df, relevant_df["extracted_data"].apply(pd.Series)],
    axis=1,
).drop(columns=["extracted_data", "is_valid_sample", "explanation"])

extracted_df.head(20)

Unnamed: 0,text,name,age,city,occupation,interests,is_married,num_children,net_worth,first_letter
0,"Alice is 30 years old, lives in New York, and ...",Alice,30,New York,software engineer,"[hiking, reading]",False,1,1200000,A
1,"Bob, 25, is a teacher in London. He's an avid ...",Bob,25,London,teacher,[cycling],True,2,500000,B
3,Charlie is a 40-year-old data scientist from C...,Charlie,40,Chicago,data scientist,"[skiing, cooking, photography]",True,1,800000,C
5,"David, a 35-year-old architect, resides in San...",David,35,San Francisco,architect,[rock climbing],False,0,1500000,D
6,Emily is a 28-year-old nurse in Seattle. She l...,Emily,28,Seattle,nurse,"[traveling, trying new foods]",False,0,400000,E
8,Frank is a 50-year-old lawyer living in Boston...,Frank,50,Boston,lawyer,"[golfing, fishing]",True,3,3200000,F
9,"Grace, a 22-year-old student in Austin, is pas...",Grace,22,Austin,student,"[music, volunteering]",False,0,0,G
10,"Katrina, a 28-year-old art expert in NYC.",Katrina,28,NYC,art expert,[],False,0,0,K


In [19]:
# Display the whole extracted frame.
# NOTE(review): the saved output of this cell comes from an earlier execution
# (In [19], before the In [31] cell) and has no `first_letter` column; re-run to refresh.
extracted_df

Unnamed: 0,text,name,age,city,occupation,interests,is_married,num_children,net_worth
0,"Alice is 30 years old, lives in New York, and ...",Alice,30,New York,software engineer,"[hiking, reading]",False,1,1200000
1,"Bob, 25, is a teacher in London. He's an avid ...",Bob,25,London,teacher,[cycling],True,2,500000
3,Charlie is a 40-year-old data scientist from C...,Charlie,40,Chicago,data scientist,"[skiing, cooking, photography]",True,1,800000
5,"David, a 35-year-old architect, resides in San...",David,35,San Francisco,architect,[rock climbing],False,0,1500000
6,Emily is a 28-year-old nurse in Seattle. She l...,Emily,28,Seattle,nurse,"[traveling, trying new foods]",False,0,400000
8,Frank is a 50-year-old lawyer living in Boston...,Frank,50,Boston,lawyer,"[golfing, fishing]",True,3,3200000
9,"Grace, a 22-year-old student in Austin, is pas...",Grace,22,Austin,student,"[music, volunteering]",False,0,0
10,"Katrina, a 28-year-old art expert in NYC.",Katrina,28,NYC,art expert,[],False,0,0


In [None]:
import pandas as pd
import praw
from sculptor import Sculptor
from helpers.data_sources import CSVDataSource, RedditDataSource

# Example: apply one Sculptor to rows coming from two different data sources.
# The `schema=` keyword and the `helpers.data_sources` import path match the
# cells in this notebook that have actually been executed; this cell previously
# used `extraction_schema=` and `sculptor.helpers.data_sources`.
# NOTE(review): confirm against the current Sculptor signature and package layout.
sculptor = Sculptor(
    schema={
        "name": {"type": "string", "description": "The entity's name"},
        "summary": {"type": "string", "description": "A summary"}
    },
    template="Entity: {text}"
)

# Get data from a CSV using the helper
csv_source = CSVDataSource("my_data.csv", sep=",")
df_csv = csv_source.get_data()

# Get data from Reddit. The credential strings below are placeholders; supply
# real values (for example from secrets.toml) rather than committing them here.
reddit = praw.Reddit(client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET", user_agent="YOUR_USER_AGENT")
reddit_source = RedditDataSource(query="artificial intelligence", subreddits=["machinelearning"], reddit_client=reddit)
df_reddit = reddit_source.get_data()

# Apply the Sculptor row by row and store each result in an `extracted` column
df_csv['extracted'] = df_csv.apply(lambda row: sculptor.sculpt(row), axis=1)
df_reddit['extracted'] = df_reddit.apply(lambda row: sculptor.sculpt(row), axis=1)

print(df_csv)
print(df_reddit)

# Test full AI Therapy Pipeline

In [4]:
%load_ext autoreload
%autoreload 2

import os
import toml
import praw
import pandas as pd
from sculptor import Sculptor
from sculptor import SculptorPipeline
from helpers.data_sources import RedditDataSource

secrets = toml.load("secrets.toml")
os.environ["OPENAI_API_KEY"] = secrets["openai"]["api_key"]
os.environ["DEEPINFRA_API_KEY"] = secrets["deepinfra"]["api_key"]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Reddit client built from the `reddit` section of the loaded secrets.
reddit_credentials = secrets["reddit"]
reddit = praw.Reddit(
    client_id=reddit_credentials["client_id"],
    client_secret=reddit_credentials["client_secret"],
    user_agent=reddit_credentials["user_agent"],
)

# Source 1: AI + mental-health query with no subreddit restriction given.
reddit_src1 = RedditDataSource(
    query="(AI OR chatbot OR GPT) AND (mental health OR therapy)",
    limit=10,
    include_comments=False,
    reddit_client=reddit,
)

# Source 2: broader AI query, restricted to two subreddits.
reddit_src2 = RedditDataSource(
    query="(AI OR chatbot OR GPT)",
    subreddits=["ADHD", "anxiety"],
    limit=10,
    include_comments=False,
    reddit_client=reddit,
)

# Fetch both result sets in order, stack them, and drop repeated post ids.
dfs = [src.get_data() for src in (reddit_src1, reddit_src2)]
df = pd.concat(dfs, ignore_index=True)
df = df.drop_duplicates(subset='id')

In [10]:
# Build the AI-therapy pipeline from its YAML config and run it on the Reddit
# posts collected in `df`. The saved output shows two steps: a relevance filter
# (20 items in, 6 kept) followed by structured extraction.
pipeline = SculptorPipeline.from_config('examples/ai_therapy.yaml')
# NOTE(review): `df` is passed as a DataFrame here, whereas the CSV example
# passes df.to_dict('records') — presumably process() accepts both; confirm.
results = pipeline.process(df)


Step 1/2


Processing items:   5%|▌         | 1/20 [00:03<01:02,  3.27s/it]

LLM Output: {
    "relevant_sample": true,
    "relevant_sample_explanation": "The user explicitly describes their personal experience of using Chat GPT for mental health support, sharing how it helped them build confidence, reframe negative thoughts, and make progress in a short amount of time. They provide specific details about their interactions with Chat GPT, such as using the voice feature for daily conversations and receiving guidance on overcoming self-esteem issues."
}


Processing items:  10%|█         | 2/20 [00:05<00:52,  2.89s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The post title and text do not indicate a personal experience of using AI for mental health, coaching, or emotional support. Instead, it appears to be a comparative discussion between AI models and mental disabilities, lacking a first-hand account of AI usage for mental health purposes."}


Processing items:  15%|█▌        | 3/20 [00:10<01:01,  3.60s/it]

LLM Output: {"relevant_sample": true, "relevant_sample_explanation": "The user shares a clear, first-hand experience of using ChatGPT for mental health support, describing how it provided valuable advice and perspectives that helped them make progress with their issues, and explicitly states that it 'feels like it understands me better than all the people I ever talked to about my problems'."}


Processing items:  20%|██        | 4/20 [00:13<00:52,  3.27s/it]

LLM Output: {
    "relevant_sample": false,
    "relevant_sample_explanation": "The post discusses a data breach involving a US-based AI healthcare firm, but it does not share a personal experience of using AI for mental health, coaching, or emotional support. The focus is on a privacy risk rather than an individual's experience with AI in mental health."
}


Processing items:  25%|██▌       | 5/20 [00:15<00:46,  3.12s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The post does not mention the use of AI or chatbots for mental health, coaching, or emotional support. It is a personal story about a family conflict related to a wedding and past bullying, but does not involve AI in any way."}


Processing items:  30%|███       | 6/20 [00:18<00:39,  2.79s/it]



Processing items:  35%|███▌      | 7/20 [00:19<00:30,  2.32s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The post discusses the potential of AI in video generation, comparing it to advancements in text generation, but does not share a personal experience of using AI for mental health, coaching, or emotional support."}


Processing items:  40%|████      | 8/20 [00:21<00:27,  2.27s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The post does not mention using AI or chatbots for mental health, coaching, or emotional support, and does not describe a personal experience related to these topics."}


Processing items:  45%|████▌     | 9/20 [00:24<00:25,  2.32s/it]

LLM Output: {
    "relevant_sample": false,
    "relevant_sample_explanation": "The post does not mention a personal experience of using AI for mental health, coaching, or emotional support. It appears to be a news article about a lawsuit against Sam Altman, with no relevance to the use of AI in mental health."
}


Processing items:  50%|█████     | 10/20 [00:26<00:22,  2.27s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The post lacks a clear, first-hand experience of using AI or chatbots for mental health, support, coaching, or similar. The text 'mentality!!!' is too vague and does not provide any personal experience or description of using AI tools."}


Processing items:  55%|█████▌    | 11/20 [00:29<00:23,  2.60s/it]

LLM Output: {
    "relevant_sample": false,
    "relevant_sample_explanation": "The post does not share a personal experience of using AI for mental health, coaching, or emotional support. Instead, it discusses the author's discovery that a book on ADHD was likely generated by ChatGPT and warns others about it. The post is more focused on criticizing AI-generated content than sharing a personal experience with AI in mental health."
}


Processing items:  60%|██████    | 12/20 [00:32<00:20,  2.58s/it]

LLM Output: {
    "relevant_sample": false,
    "relevant_sample_explanation": "The user shares a personal experience of using ChatGPT, but it's primarily for work and building/training bots, not specifically for mental health, emotional support, or coaching. Although the post is in the ADHD subreddit, the focus is on the user's enthusiasm for ChatGPT rather than using it for ADHD management or support."
}


Processing items:  65%|██████▌   | 13/20 [00:35<00:19,  2.79s/it]

LLM Output: {
    "relevant_sample": true,
    "relevant_sample_explanation": "The user shares their personal experience and struggles with ADHD, specifically describing their difficulties with executive functioning skills and their desire for an AI assistant to support them. They also mention trying to use chat GPT but facing limitations, demonstrating a clear first-hand experience with AI tools for mental health and support."
}


Processing items:  70%|███████   | 14/20 [00:37<00:14,  2.46s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The post discusses scientific research on the genetic basis of ADHD but does not share a personal experience of using AI for mental health, coaching, or emotional support."}


Processing items:  75%|███████▌  | 15/20 [00:40<00:13,  2.74s/it]

LLM Output: {
    "relevant_sample": true,
    "relevant_sample_explanation": "The user explicitly describes a personal experience of using ChatGPT as a tool for managing their emotions and regulating their ADHD, sharing a specific 'hack' they discovered and how it has helped them in their daily life, including handling conflicts and breaking emotional spirals."
}


Processing items:  80%|████████  | 16/20 [00:43<00:11,  2.96s/it]

LLM Output: {
    "relevant_sample": true,
    "relevant_sample_explanation": "The user explicitly describes their personal experience of using chatGPT to help organize their thoughts, specifically mentioning how it aids their 'scatter-brain' and helps with breaking down ideas into actionable steps, which is a clear first-hand experience of using AI for mental health and productivity support."
}


Processing items:  85%|████████▌ | 17/20 [00:46<00:08,  2.99s/it]

LLM Output: {
    "relevant_sample": true,
    "relevant_sample_explanation": "The user shares a personal experience of using ChatGPT for roleplay related to psychology, which helped them create a plan to tackle one of their phobias and improve their daily planning. This demonstrates a clear, first-hand experience of using AI for mental health and wellbeing."
}


Processing items:  90%|█████████ | 18/20 [00:50<00:06,  3.16s/it]

LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The user is seeking a tool to help with writing math equations due to the repetitive nature of the task, which they find tedious. While the post is from the ADHD subreddit, the user does not explicitly describe a personal experience of using AI for mental health, coaching, or emotional support, but rather seeks assistance with a specific task."}


Processing items:  95%|█████████▌| 19/20 [00:55<00:03,  3.63s/it]

LLM Output: {
    "relevant_sample": false,
    "relevant_sample_explanation": "The user is seeking advice and recommendations for apps or AI resources to help with task completion and organization due to ADHD, but does not share a personal experience of using AI for mental health, coaching, or emotional support. They mention a hypothetical interest in something like ChatGPT that automates tasks, but this is a request for information rather than a description of their own experience."
}


Processing items: 100%|██████████| 20/20 [00:58<00:00,  2.93s/it]


LLM Output: {"relevant_sample": false, "relevant_sample_explanation": "The user is asking for advice on using AI to generate an informational PDF for a trip, but does not share a personal experience of using AI for mental health, coaching, or emotional support. The post is about using AI for a practical task, not related to mental health or wellness."}
Filtered to 6 items

Step 2/2


Processing items:  17%|█▋        | 1/6 [00:11<00:58, 11.65s/it]

LLM Output: {
    "overall_sentiment": 9,
    "benefits": ["increased self-esteem", "improved confidence", "better emotional regulation"],
    "downsides": null,
    "specific_use_cases": ["CBT", "reframing negative thoughts", "building confidence", "journaling"],
    "mentioned_conditions": ["low self-esteem", "depression", "anxiety"],
    "provider_status": "previously seen provider",
    "issues_with_human_providers": ["slow progress in therapy"],
    "analysis_notes": "The user credits Chat GPT with helping them identify and overcome negative thought patterns, build confidence, and develop a more positive self-image. They express frustration with slow progress in traditional therapy, but found success with AI-powered support."
}


Processing items:  33%|███▎      | 2/6 [00:24<00:50, 12.63s/it]

LLM Output: {
    "overall_sentiment": 9,
    "benefits": ["good advice", "new perspectives", "feeling understood", "progress with mental health"],
    "downsides": ["initial skepticism"],
    "specific_use_cases": ["venting", "journaling", "problem-solving"],
    "mentioned_conditions": ["broken mind", "being stuck"],
    "provider_status": "previously seen provider",
    "issues_with_human_providers": ["didn't work for me"],
    "analysis_notes": "The user is surprised by the effectiveness of using ChatGPT for mental health support, finding it to provide good advice and new perspectives. They feel understood by the AI, which is helping them make progress with their mental health."
}


Processing items:  50%|█████     | 3/6 [00:35<00:34, 11.64s/it]

LLM Output: {
    "overall_sentiment": 8,
    "benefits": ["organization", "reminders", "prioritization"],
    "downsides": ["ineffective training", "lack of memory"],
    "specific_use_cases": ["executive functioning support", "task management"],
    "conditions": ["ADHD"],
    "provider_status": "not mentioned",
    "issues_with_human_providers": "not mentioned",
    "analysis_notes": "The user is seeking an AI assistant to help with executive functioning skills, specifically with task management, prioritization, and reminders, due to struggles with ADHD."
}


Processing items:  67%|██████▋   | 4/6 [00:42<00:19,  9.65s/it]

LLM Output: {
    "overall_sentiment": 9,
    "benefits": ["emotional regulation", "safe space for venting", "reality check", "practical suggestions"],
    "downsides": null,
    "specific_use_cases": ["venting", "journaling", "emotional regulation"],
    "mentioned_conditions": ["RSD", "ADHD"],
    "provider_status": null,
    "issues_with_human_providers": null,
    "analysis_notes": "User finds ChatGPT helpful for managing emotions and regulating responses during conflicts, using a creative 'hack' of writing fake AITA posts."
}


Processing items:  83%|████████▎ | 5/6 [00:51<00:09,  9.51s/it]

LLM Output: {
    "overall_sentiment": 9,
    "benefits": ["organization", "clarification", "actionable steps", "progress tracking", "memory aid"],
    "downsides": null,
    "specific_use_cases": ["brainstorming", "idea organization", "task planning", "journaling"],
    "mentioned_conditions": ["scatter-brain"],
    "provider_status": null,
    "issues_with_human_providers": null,
    "analysis_notes": "The user utilizes chatGPT to organize their thoughts, turning ideas into actionable steps, and tracking progress. This helps alleviate feelings of lack of accomplishment and provides a memory aid for future reference."
}


Processing items: 100%|██████████| 6/6 [01:03<00:00, 10.59s/it]

LLM Output: {
    "overall_sentiment": 9,
    "benefits": ["improved daily planning", "detailed plan to tackle phobia"],
    "downsides": null,
    "specific_use_cases": ["roleplay", "planning", "phobia management"],
    "mentioned_conditions": ["phobia"],
    "provider_status": null,
    "issues_with_human_providers": null,
    "analysis_notes": "User found AI tool (ChatGPT) helpful for managing phobia and improving daily planning, in conjunction with another tool (goblin.tools)."
}





In [8]:
# Inspect the schema of the sculptor used in the pipeline's second step.
# NOTE(review): the [1][0] indexing suggests each step is a tuple whose first
# element is the sculptor — confirm. The field names shown in the saved output
# (e.g. `sentiment`, `use_cases`) differ from the keys in the recorded LLM
# responses (e.g. `overall_sentiment`, `specific_use_cases`); the two may come
# from different config versions.
pipeline.steps[1][0].schema

{'sentiment': {'type': 'integer',
  'description': 'Sentiment towards AI for mental health (1-10, 10 most positive)',
  'items': None,
  'enum': None},
 'benefits': {'type': 'array',
  'description': 'Keywords for benefits: non_judgemental, affordable, accessible, etc',
  'items': 'string',
  'enum': None},
 'downsides': {'type': 'array',
  'description': 'Keywords for downsides: repetitive, shallow, unreliable, etc',
  'items': 'string',
  'enum': None},
 'use_cases': {'type': 'array',
  'description': 'How AI is used: reflection, venting, CBT, journaling, etc',
  'items': 'string',
  'enum': None},
 'conditions': {'type': 'array',
  'description': 'Conditions mentioned: ADHD, depression, anxiety, etc',
  'items': 'string',
  'enum': None},
 'seeing_provider': {'type': 'boolean',
  'description': 'Currently seeing a mental health provider (True or False)',
  'items': None,
  'enum': None},
 'previous_provider': {'type': 'boolean',
  'description': 'Has previously seen a mental health 

In [11]:
# Collect the pipeline results into a DataFrame and preview the first rows.
# NOTE(review): the saved preview has both `mentioned_conditions` and
# `conditions` columns; one recorded LLM response used the key `conditions`,
# which would explain the extra, mostly-empty column.
results_df = pd.DataFrame(results)
results_df.head()


Unnamed: 0,id,text,title,context_text,url,subreddit,score,created_utc,is_comment,comment_id,...,relevant_sample_explanation,overall_sentiment,benefits,downsides,specific_use_cases,mentioned_conditions,provider_status,issues_with_human_providers,analysis_notes,conditions
0,1fajq7r_post,I've been desperately trying to figure out wha...,Chat GPT Transforms My Mental Health In 2 Weeks,,https://reddit.com/r/ChatGPT/comments/1fajq7r/...,ChatGPT,775,2024-09-06 16:40:03,False,,...,The user explicitly describes their personal e...,9,"[increased self-esteem, improved confidence, b...",,"[CBT, reframing negative thoughts, building co...","[low self-esteem, depression, anxiety]",previously seen provider,[slow progress in therapy],The user credits Chat GPT with helping them id...,
1,1gmmujy_post,Hey!\n\nYou probably heard about people using ...,Using ChatGPT as a tool to improve your mental...,,https://reddit.com/r/DecidingToBeBetter/commen...,DecidingToBeBetter,756,2024-11-08 16:40:52,False,,...,"The user shares a clear, first-hand experience...",9,"[good advice, new perspectives, feeling unders...",[initial skepticism],"[venting, journaling, problem-solving]","[broken mind, being stuck]",previously seen provider,[didn't work for me],The user is surprised by the effectiveness of ...,
2,1bhxtuf_post,"For me specifically, I am looking for somethin...",What are the best ai assistants or ai tools fo...,,https://reddit.com/r/ADHD/comments/1bhxtuf/wha...,ADHD,24,2024-03-18 18:23:40,False,,...,The user shares their personal experience and ...,8,"[organization, reminders, prioritization]","[ineffective training, lack of memory]","[executive functioning support, task management]",,not mentioned,not mentioned,The user is seeking an AI assistant to help wi...,[ADHD]
3,1hjkwj8_post,I feel like my RSD plays a huge role in my rel...,ChatGPT Hack,,https://reddit.com/r/ADHD/comments/1hjkwj8/cha...,ADHD,11,2024-12-21 22:31:38,False,,...,The user explicitly describes a personal exper...,9,"[emotional regulation, safe space for venting,...",,"[venting, journaling, emotional regulation]","[RSD, ADHD]",,,User finds ChatGPT helpful for managing emotio...,
4,1h2iv07_post,This is from a comment I made on some tech pos...,Using “AI” to organize my scatter Brain thoughts,,https://reddit.com/r/ADHD/comments/1h2iv07/usi...,ADHD,0,2024-11-29 10:22:13,False,,...,The user explicitly describes their personal e...,9,"[organization, clarification, actionable steps...",,"[brainstorming, idea organization, task planni...",[scatter-brain],,,The user utilizes chatGPT to organize their th...,


# Old

In [7]:
# Short name of the analysis configuration to load.
CONFIG_TAG = "ai_therapy"
# Path of the TOML file for that configuration, derived from the tag.
CONFIG_FILE = "configs/" + CONFIG_TAG + ".toml"

In [8]:
import toml
import pandas as pd
from openai import OpenAI
from helpers.datasource import RedditDataSource, HackerNewsDataSource

from helpers.processor import Processor, FILTER_PROMPT
import helpers.visualizer as visualizer

# Credentials and the analysis configuration selected by CONFIG_FILE.
secrets = toml.load("secrets.toml")
config = toml.load(CONFIG_FILE)

# Free-text description of the use case and the field definitions to extract.
ANALYSIS_USE_CASE = config["use_case"]
FIELDS = config["fields"]

# DeepInfra client. A missing [deepinfra] section or key falls back to None
# via .get(), so the client is then constructed with api_key/base_url = None.
DEEPINFRA_CONFIG = secrets.get("deepinfra", {})
DEEPINFRA_LLM = OpenAI(
    **{key: DEEPINFRA_CONFIG.get(key) for key in ("api_key", "base_url")}
)

# OpenAI client, built the same way. It is created here but not passed to the
# Processor below, which uses the DeepInfra client for both stages.
OPENAI_CONFIG = secrets.get("openai", {})
OPENAI_LLM = OpenAI(
    **{key: OPENAI_CONFIG.get(key) for key in ("api_key", "base_url")}
)

# Processor with a cheaper model for filtering and a more accurate model for extraction.
processor = Processor(
    use_case_description=ANALYSIS_USE_CASE,
    filter_prompt=FILTER_PROMPT,
    extraction_schema=FIELDS,
    filter_llm_client=DEEPINFRA_LLM,
    filter_model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
    extract_llm_client=DEEPINFRA_LLM,
    extract_model="meta-llama/Meta-Llama-3.1-405B-Instruct",
)

In [3]:
# Instantiate one data-source object per entry in config["data_sources"].
# Each entry's "type" key selects the class; the remaining keys are passed to
# the constructor as keyword arguments. Entries with an unrecognised or missing
# type are reported and skipped.
data_sources = []

for ds_conf in config["data_sources"]:
    # Work on a shallow copy: popping "type" from the entry itself would strip
    # it from `config`, so re-running this cell would see type=None for every
    # entry and build no sources.
    ds_kwargs = dict(ds_conf)
    ds_type = ds_kwargs.pop("type", None)
    if ds_type == "reddit":
        data_sources.append(RedditDataSource(**ds_kwargs))
    elif ds_type == "hackernews":
        data_sources.append(HackerNewsDataSource(**ds_kwargs))
    else:
        print(f"Unknown data source type: {ds_type}")

In [4]:
# Pull a frame from every configured source, stack them into one frame with a
# fresh index, and drop rows whose `id` repeats.
dfs = list(map(lambda source: source.get_data(), data_sources))
df = pd.concat(dfs, ignore_index=True)
df = df.drop_duplicates(subset='id')

In [None]:
# Two-stage selection: a first-level filter over all rows, structured
# extraction on the filtered frame, then a second filter on `relevant_sample`.
# For a quick test on a small random sample, filter this instead:
# df_filtered = processor.filter_data(df.sample(n=100, random_state=42))
df_filtered = processor.filter_data(df)
df_extracted = processor.extract_fields(df_filtered)
samples = df_extracted.loc[df_extracted['relevant_sample'].eq(True)]
# Report how many rows survive each stage.
print(f"Samples: initial={len(df)}, after 1st filter={len(df_filtered[df_filtered['is_relevant']==True])}, final={len(samples)}")

In [None]:
# Backup dataframes to JSON files under out/ (disabled; uncomment to run)
# df.to_json(f"out/{CONFIG_TAG}.json", orient='records', date_format='iso')
# df_filtered.to_json(f"out/{CONFIG_TAG}-filtered.json", orient='records', date_format='iso')
# samples.to_json(f"out/{CONFIG_TAG}-samples.json", orient='records', date_format='iso')

In [None]:
# Simple plots are created automatically by the visualizer helper.
# The Visualization object is built from the final `samples` frame and the
# FIELDS definitions loaded from the config.
viz = visualizer.Visualization(samples, FIELDS)
viz.plot_all_fields(show_examples=True)
# Show three sample rows together with the two named extra fields.
viz.show_samples(n=3, extra_fields=['sentiment','use_cases'])
# Plot against the `created_utc` column, titled "Posts Over Time".
viz.plot_by_time('created_utc', "Posts Over Time")
# Optional extra plots, currently disabled:
# viz.plot_group_comparison('subreddit', 'sentiment', agg='mean')
# viz.plot_correlation(['sentiment'])