In [1]:
# Path to the TOML file for this analysis run; the setup cell loads it with
# toml.load and reads the "use_case", "fields" and "data_sources" entries from it.
CONFIG_FILE = "configs/ivf.toml"

In [2]:
import toml
import pandas as pd
from openai import OpenAI
from datasource import RedditDataSource
from processor import Processor, FILTER_PROMPT
import visualizer
from importlib import reload

# --- Analysis configuration -------------------------------------------------
config = toml.load(CONFIG_FILE)
ANALYSIS_USE_CASE = config["use_case"]
FIELDS = config["fields"]

# --- LLM clients ------------------------------------------------------------
# Credentials live in secrets.toml, one table per provider.
secrets = toml.load("secrets.toml")


def _build_llm_client(provider):
    """Create an OpenAI-compatible client from the `provider` table of `secrets`.

    A missing table or missing key results in ``None`` being passed for that
    argument, leaving the decision on how to handle it to the OpenAI client.
    """
    provider_secrets = secrets.get(provider, {})
    return OpenAI(
        api_key=provider_secrets.get("api_key"),
        base_url=provider_secrets.get("base_url"),
    )


DEEPINFRA = "deepinfra"
DEEPINFRA_LLM = _build_llm_client(DEEPINFRA)

OPENAI = "openai"
OPENAI_LLM = _build_llm_client(OPENAI)

# --- Processor --------------------------------------------------------------
# Filtering and extraction can use different clients/models; at the moment both
# stages are pointed at the same DeepInfra client and the same model id.
processor = Processor(
    use_case_description=ANALYSIS_USE_CASE,
    filter_prompt=FILTER_PROMPT,
    extraction_schema=FIELDS,
    filter_llm_client=DEEPINFRA_LLM,
    extract_llm_client=DEEPINFRA_LLM,
    filter_model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
    extract_model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
)

In [3]:
# Instantiate one data source per entry of the config's "data_sources" list.
# Each entry is a table whose "type" key selects the class; the remaining keys
# are forwarded to the constructor as keyword arguments.
data_sources = []
for ds_conf in config["data_sources"]:
    # Pop "type" from a shallow copy, not from the config entry itself.
    # Mutating `config` would make this cell non-idempotent: on a second run
    # every entry would have lost its "type" and be reported as unknown.
    ds_kwargs = dict(ds_conf)
    ds_type = ds_kwargs.pop("type", None)
    if ds_type == "reddit":
        data_sources.append(RedditDataSource(**ds_kwargs))
    else:
        print(f"Unknown data source type: {ds_type}")

In [4]:
# Fetch one DataFrame per configured data source.
dfs = [source.get_data() for source in data_sources]

# pd.concat raises a terse "No objects to concatenate" on an empty list, which
# happens when no data source could be created (e.g. every configured "type"
# was unknown). Fail with a message that points at the actual cause instead.
if not dfs:
    raise ValueError(
        "No data sources available: check the 'data_sources' entries of the "
        "config file for missing or unknown 'type' values."
    )

# Stack all sources into one frame and keep only the first row for each 'id'.
df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset='id')

In [5]:
# Stage 1: first-level relevance filtering over the whole data set.
# (For a quick trial run, filter a subset instead:
#  processor.filter_data(df.sample(n=100, random_state=42)).)
df_filtered = processor.filter_data(df)

# Stage 2: extraction of the structured fields.
df_extracted = processor.extract_fields(df_filtered)

# Stage 3: second-level filtering, keeping rows whose 'relevant_sample' is True.
samples = df_extracted.loc[df_extracted['relevant_sample'] == True]

# Report how many rows survive each stage.
n_after_first_filter = len(df_filtered.loc[df_filtered['is_relevant'] == True])
print(f"Samples: initial={len(df)}, after 1st filter={n_after_first_filter}, final={len(samples)}")

Filtering data: 100%|██████████| 671/671 [04:14<00:00,  2.63it/s]
Extracting fields:  67%|██████▋   | 18/27 [00:11<00:05,  1.60it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-JBqEOvyYgf1t2QEfnjRxwGhu on tokens per min (TPM): Limit 30000, Used 29463, Requested 1165. Please try again in 1.256s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [6]:

# Re-execute the visualizer module so edits made to it since it was first
# imported are picked up without restarting the kernel.
reload(visualizer)

# NOTE: `samples` is assigned at the end of the filtering/extraction cell. If
# that cell raised before reaching the assignment, this cell runs against
# whatever `samples` value is still in the kernel (or fails if there is none).
viz = visualizer.Visualization(samples, FIELDS)

# One plot per configured field (presumably driven by FIELDS -- confirm in visualizer).
viz.plot_all_fields(show_examples=True)
# NOTE(review): 'use_cases' does not appear among the field names reported in
# this cell's recorded output -- confirm it is a real column.
viz.show_samples(n=3, extra_fields=['sentiment','use_cases'])
viz.plot_by_time('created_utc', "Posts Over Time")
viz.plot_group_comparison('subreddit', 'sentiment', agg='mean')
viz.plot_correlation(['sentiment'])  

No data for 'relevant_sample' to plot binary distribution.
No valid data for 'baseline_eggs' to plot integer distribution.
No valid data for 'baseline_embryos' to plot integer distribution.
No valid data for 'new_eggs' to plot integer distribution.
No valid data for 'new_embryos' to plot integer distribution.
No list items found for 'interventions'.
No list items found for 'conditions'.
No valid data for 'sentiment' to plot integer distribution.
No list items found for 'advice'.
No samples to display.
No valid datetime values in created_utc to plot over time.
No data after aggregation mean for sentiment by subreddit.
No valid numeric data for correlation.
