# Workshop notebook 1 - Amuse-Bouche

Before we get into too much detail, here's a taste of deep analytics. You can mostly click through
this notebook, and I won't explain everything that's going on in it, but the broad strokes are these:

1. We'll start by ingesting into Aryn a bunch of data that I've preprocessed and placed in S3
2. We'll then run a processing job to answer: "List all the companies that mentioned inflation and give me a count of the number of times each of the companies mentioned inflation."

In [None]:
# Imports & setup
import sycamore
from pathlib import Path
from aryn_sdk.client.client import Client

# Directory named "materialize" under the current working directory;
# the ingest cell reads its input from a subdirectory of it.
materialize_dir = Path.cwd().joinpath("materialize")
# Sycamore context; the later cells go through it for every read.
ctx = sycamore.init()

In [None]:
# Create the target docset in Aryn and keep its id; the write and read
# cells below address the docset through `docset_id`.
aryn_client = Client()
aryn_docset = aryn_client.create_docset(name="haystack-workshop-nb-0")
docset_id = aryn_docset.value.docset_id

In [None]:
# Write the data: read the materialized docset stored under
# materialize/about-to-ingest and write it into the Aryn docset.
ctx.read.materialize(path=materialize_dir / "about-to-ingest").write.aryn(docset_id=docset_id)

In [None]:
# Imports and prep for the analytics
from sycamore.llms.openai import OpenAI, OpenAIModels
from sycamore.functions.tokenizer import OpenAITokenizer
from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt
from sycamore.llms.llms import LLMMode

# LLM client handed to `llm_filter` in the analytics cell.
llm = OpenAI(OpenAIModels.GPT_4O_MINI)
# Tokenizer built from the same model's name; passed to `llm_filter` alongside
# `max_tokens` (presumably to bound how much text goes into each LLM call — TODO confirm).
tk = OpenAITokenizer(OpenAIModels.GPT_4O_MINI.value.name)

In [None]:
# Read the docset back from Aryn and count inflation mentions per company.
#
# Pipeline stages, in order:
#   1. explode()      - presumably splits each document into its child elements (TODO confirm).
#   2. filter(...)    - keep only documents that carry a "parent_id" key.
#   3. llm_filter()   - ask the LLM "Does this text mention inflation?" for each document;
#                       the result field is named "inflation_mentioned_confidence".
#   4. groupby_count  - count the surviving documents per 'properties.entity.company_name'.
#   5. take_all()     - execute the pipeline and collect the resulting documents.
count_docs = (
    ctx.read.aryn(docset_id=docset_id)
    .explode()
    .filter(lambda doc: "parent_id" in doc)
    .llm_filter(
        llm=llm,
        new_field="inflation_mentioned_confidence",
        prompt=LlmFilterMessagesJinjaPrompt.fork(
            filter_question="Does this text mention inflation?"
        ),
        tokenizer=tk,
        max_tokens=80_000,
    )
    .groupby_count('properties.entity.company_name')
    .take_all()
)

In [None]:
# Display the results as a two-column table: company name and mention count.
#
# `rich.table` is imported explicitly: a bare `import rich` does not load the
# `rich.table` submodule, so `rich.table.Table` can raise AttributeError unless
# some other module happened to import it first.
import rich
import rich.table

inflation_table = rich.table.Table(title="inflation_mentions")
inflation_table.add_column("company")
inflation_table.add_column("mentions")

# Build (count, key) tuples so that sorting orders the rows by ascending
# mention count, with ties broken by the group key (the company name).
counts = [(d.properties['count'], d.properties['key']) for d in count_docs]
for c, k in sorted(counts):
    # Convert the count to str so the cell always receives text.
    inflation_table.add_row(k, str(c))

rich.print(inflation_table)