In [1]:
import sys
sys.path.append('.')  # Add current dir to path

from agno.agent import Agent
from agno.models.google import Gemini
from agno.tools.thinking import ThinkingTools
from agno.tools.reasoning import ReasoningTools
from app import df
from Agents import query_gen, filter_agent, coder_agent, executor_agent, analyzer_agent, report_agent, exec_python

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 631/631 [01:45<00:00,  5.99it/s]
100%|██████████| 41/41 [00:54<00:00,  1.32s/it]


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


In [2]:
query_eval_agent = Agent(
    name="QueryEvaluator",
    role="Evaluate the quality of structured queries",
instructions="""
You are a query evaluation agent.

You will receive:
1. A user goal in natural language (e.g. a business question).
2. A structured query (in JSON-like format) generated from that goal.

Your task is to assess how well the structured query captures the user's intent.

Specifically, check for:
- Intent coverage: Does the query fully reflect what the user wants?
- Filtering correctness: Are the filters relevant and correct?
- Metrics relevance: Are the metrics appropriate for the goal?
- Clarity: Can a downstream agent easily understand and use the query?

Your response must be a valid JSON object like:

{
  "score": <integer from 1 to 10>,
  "justification": "<specific explanation of what works, what's missing or wrong, and how to improve it>"
}

Scoring guidelines:
- 9-10: Near perfect — fully aligned, usable query
- 6-8: Good effort but needs adjustment (e.g. wrong filter, vague metric)
- 3-5: Significant gaps in filters or logic
- 1-2: Query misses or misrepresents the goal
"""
,
    model=Gemini(id="gemini-2.5-flash", temperature=0.1)
)


In [3]:
filter_eval_agent = Agent(
    name="FilterEvaluator",
    role="Evaluate the accuracy of the filtered comments",
    instructions="""
You are an evaluation agent.

You will receive:
1. A list of reference comments (ground truth — what should have been retrieved).
2. A list of filtered comments (what the filtering agent actually retrieved).

Your task:
- Compare the two sets and identify the number of correctly retrieved comments.
- A comment is considered "correctly retrieved" if it exactly matches a reference comment or is semantically very similar (minor differences in spelling or punctuation are okay).
- Do not count partial matches or irrelevant content.

Then compute the accuracy using this formula:
(Number of correctly retrieved comments) / (Total number of reference comments)

Respond with **only** the accuracy as a float between 0 and 1 (e.g., 0.83).
Do not explain or add anything else.
""",
model = Gemini("gemini-2.5-flash", temperature=0.1)
)

In [4]:
def run_stat(structured_query, filtered_data):
    code = coder_agent.run("query : \n" + structured_query + "\n\n comments: " + filtered_data).content
    return executor_agent.run(code).content

In [5]:
analysis_eval = Agent(name = "AnalysisEvaluator", role = "Evaluate an analysis agent work", instructions= 
                                        """
You are a fact-checker reviewing whether an analysis is truly based on the input data.

You will receive:
- A business question
- A list of comments (comment_1, comment_2, etc.)
- A structured analysis generated from those comments

Your task:
1. Break the analysis down into individual claims (statements of insight, statistics, summaries).
2. For each claim, check whether it is clearly supported by one or more of the comments.
3. Mark each claim as "Supported" or "Not Supported".
4. If anything is invented or vague, flag it as a hallucination.
5. At the end, report the % of supported claims.

Respond in this format:

Evaluation:
- Claims Checked: N
- Supported Claims: X
- Unsupported/Hallucinated Claims: Y
- Grounding Score: (X / N * 100)%
- Notes: [Optional explanations for unsupported claims]
""",
                      
model = Gemini(id = "gemini-2.5-flash", temperature= 0.1))

In [6]:
report_eval = Agent(name = "ReportEvaluator", role = "Evaluate a generated report", instructions="""
You are a strict reviewer verifying if a report is faithful to a prior structured analysis.

You will receive:
- A structured analysis (with sections like Summary, Patterns, Quotes, etc.)
- A final report generated from it

Your job:
1. Read the report carefully and break it into individual claims or sentences.
2. Check if each claim exists in the analysis or is clearly inferred from it.
3. Flag any hallucinated claims (not present or justified in the analysis).
4. Check for missing sections or overlooked insights.
5. Give a final binary faithfulness score.

Respond in this format:

Evaluation:
- Claims in Report: N
- Supported by Analysis: X
- Unsupported / Hallucinated: Y
- Faithfulness Score: (X / N * 100)%
- Missing Insights: [List any key points the report left out]
- Notes: [Optional observations]
""",
model = Gemini("gemini-2.5-flash"))

## Tests :

In [7]:
query = "understand what people say about the taste and give me some numbers"

comments = df["text"].sample(20, random_state= 40)

### Query eval :

In [8]:
structured_query = query_gen.run(query).content

In [9]:
structured_query

'```json\n{\n  "intent": "Analyze public feedback and sentiment specifically related to the \'taste\' aspect of a product or service.",\n  "filters": {\n    "keywords": ["taste", "flavor", "delicious", "bland", "sweet", "sour", "bitter", "spicy", "savory", "tasty", "disgusting", "palate", "aftertaste"]\n  },\n  "metrics": [\n    "Overall sentiment distribution (positive, negative, neutral) regarding taste",\n    "Frequency of specific taste-related keywords and descriptors",\n    "Identification of common themes or recurring feedback points about taste",\n    "Average rating or satisfaction score related to taste (if applicable data exists)"\n  ],\n  "depth": "Detailed statistical breakdown and summary of taste-related feedback.",\n  "comparisons": "None",\n  "output_format": "Summary report with key findings, sentiment percentages, and frequency counts of taste descriptors."\n}\n```'

In [10]:
query_eval_agent.run("query: " + query + "generated query : "+ structured_query).content

'```json\n{\n  "score": 10,\n  "justification": "The query perfectly captures the user\'s intent. It clearly defines the focus on \'taste\' through relevant keywords and specifies the numerical outputs required, such as sentiment distribution, keyword frequency, and average ratings. The metrics are highly relevant for understanding public feedback and providing actionable numbers. The structure is clear and comprehensive."\n}\n```'

### Filter eval

In [11]:
comment = "\n".join(comments)

In [12]:
filter_agent = Agent(
    name="DataFilter",
    role="filter the dataset using semantic understanding of the structured query",
    tools = [ThinkingTools(think = True, add_instructions= True)],
    instructions="""
You are a semantic filter agent with access to a dataset of social media comments.

You will receive:
1. A structured query (JSON-like string) from the previous agent.
2. A string of comments or a reference to the dataset.

Your task:
1. Parse the structured query.
2. Use the filters in the query (e.g. sentiment, keyword, product, language) to select relevant comments.
3. Use semantic understanding, not just keyword matching. If sentiment is mentioned, try to estimate sentiment from the comment content.
4. Return ONLY the relevant comments as a string:
   - Format: "Comment 1: <text>, Comment 2: <text>, ..."

If the query is too vague or no comments match, return:
   "No comments matched the criteria."
Even if some comments are not perfect matches, return the closest relevant ones.
Never return an empty string unless the query is impossible to process.
""",
    model=Gemini(id="gemini-2.5-flash", temperature= 0.1)
)   


In [13]:
comments

12978                            wayli o lbid lbldi tahowa
6944                            dury basbousa dyal bortoal
1716     خديتو ومداقو مزوينش ومطحون بزاااف بحال دقيق وز...
7597     شحال كلينا فيه فين أيام زمان عشقنا شحال من حاج...
15009                                          تكيفاش خفيف
10148    كاع هاد الروينة ومافيك اللي يطيب انا كناخد مطي...
18782    مايديرلو والو انا راني مزال كنوكلو لبنتي حيت ض...
5613     jai réalisé le gâteau hier franchement il étai...
20121    واقيلة انا وحيدة لي معنديش مع هاد شي ديال سقاط...
4328                                     okey et merendina
7980     كناخد الكحلة كتعجبني مديريش السشية كاملة ديال ...
15102    yassine yassine hda howa wajba man bad hisa dy...
4487                                  hawai it used tu bee
19662                            كاسكروط مع الحباب والصحاب
4020     لوكو هههه كنت نمشي عند مول لحانوت نقوليه عطيني...
11015                                     شنو سميت الفرماج
3534                                   kiwi w marie w mi

In [14]:
ref_data = "comment 1 : " + comments.iloc[2] + "comment 2: " + comments.iloc[3] +"comment 3 : " + comments.iloc[6] + "comment 4: " + comments.iloc[7] +"comment 5: " + comments.iloc[8] + "comment 5: " + comments.iloc[10] + "comment 6: " + comments.iloc[17]

filtered_data = filter_agent.run("query :\n" + structured_query+ "\n\ncomments : "+ comment).content

In [15]:
filtered_data

'Comment 1: خديتو ومداقو مزوينش ومطحون بزاااف بحال دقيق وزيد فيه ريحة معجباتنيش بحال نكهات لكنديرو فلحلوة وانا دابا قطعتهوم بجوج مبقيتش كناخدهم, Comment 2: شحال كلينا فيه فين أيام زمان عشقنا شحال من حاجة شتقناها وذاك الحرشة فالفران ديال الزنقة حتى هي مقطعة غير عشوائية طابية وشحال لذيدة اللع يحفظ جميع الميمات ويطول في عمرهم ويرحم من فارق الحياة يارب, Comment 3: jai réalisé le gâteau hier franchement il était trop bon, Comment 4: كناخد الكحلة كتعجبني مديريش السشية كاملة ديال الحار غير شوية صافي اول مرة درتها كاملة كناكل ونبكي'

In [16]:
filter_eval_agent.run("reference : \n" + ref_data + "\n\n\nfiltered data \n" + filtered_data).content

'0.57'

## Code Eval

In [17]:
code = coder_agent.run("query : " + structured_query + "comments : " + filtered_data).content

In [18]:
execution = executor_agent.run(code).content

In [19]:
execution

'--- Taste Feedback Analysis ---\n\nOverall Sentiment Distribution:\n  Positive: 25.00%\n  Negative: 50.00%\n  Neutral: 25.00%\n\nFrequency of Taste-Related Keywords:\n  الحار: 1\n  نكهات: 1\n  ريحة: 1\n  aftertaste: 0\n  bitter: 0\n  taste: 0\n  spicy: 0\n  sweet: 0\n  مذاق: 0\n  flavor: 0\n  delicious: 0\n  sour: 0\n  disgusting: 0\n  savory: 0\n  tasty: 0\n  bland: 0\n  palate: 0\n'

In [20]:
filtered_data

'Comment 1: خديتو ومداقو مزوينش ومطحون بزاااف بحال دقيق وزيد فيه ريحة معجباتنيش بحال نكهات لكنديرو فلحلوة وانا دابا قطعتهوم بجوج مبقيتش كناخدهم, Comment 2: شحال كلينا فيه فين أيام زمان عشقنا شحال من حاجة شتقناها وذاك الحرشة فالفران ديال الزنقة حتى هي مقطعة غير عشوائية طابية وشحال لذيدة اللع يحفظ جميع الميمات ويطول في عمرهم ويرحم من فارق الحياة يارب, Comment 3: jai réalisé le gâteau hier franchement il était trop bon, Comment 4: كناخد الكحلة كتعجبني مديريش السشية كاملة ديال الحار غير شوية صافي اول مرة درتها كاملة كناكل ونبكي'

## Analysis eval :

In [21]:
analysis = analyzer_agent.run("query : " + structured_query +"\n\ncomments : " + filtered_data + "\n\nstatistics: " + execution).content
analysis_eval.run("data : " + filtered_data + "analysis :" + analysis).content

'Evaluation:\n- Claims Checked: 20\n- Supported Claims: 18\n- Unsupported/Hallucinated Claims: 2\n- Grounding Score: 90%\n- Notes:\n    - **Claim 19 (Unsupported/Hallucinated):** The analysis states, "The "Overall Sentiment Distribution" provided in the input statistics (Positive: 25%, Negative: 50%, Neutral: 25%) does not align with the sentiment derived directly from the four comments..." There were no "input statistics" provided in the prompt. The analysis correctly calculated the sentiment from the comments, but it hallucinated the existence of contradictory "input statistics."\n    - **Claim 20 (Unsupported/Hallucinated):** The analysis states, "The provided list of keywords for frequency analysis did not include all semantically relevant taste descriptors found in the comments..." There was no "provided list of keywords for frequency analysis" in the prompt. The analysis correctly identified relevant keywords, but it hallucinated the existence of an external, incomplete list.'

### Report Eval :

In [22]:
report = report_agent.run(analysis).content

report_eval.run("analysis : " + analysis + "report : " + report).content

'Evaluation:\n- Claims in Report: 44\n- Supported by Analysis: 44\n- Unsupported / Hallucinated: 0\n- Faithfulness Score: (44 / 44 * 100)% = 100%\n- Missing Insights: None\n- Notes:\n    *   The report faithfully extracts and structures all information provided in the analysis.\n    *   It logically infers additional details (e.g., language of quotes, implications of limitations) which enhance clarity without hallucinating.\n    *   The "Recommendations" section is a new addition in terms of report structure, but all claims within this section are direct and sound inferences based on the "Uncertainties or Gaps" identified in the analysis. This demonstrates intelligent synthesis rather than unsupported content.'

In [23]:
queries = ["what do people think about the products and give me some numbers",
           "understand why people don't like our products",
            "What are the most discussed Moroccan snacks online?",
            "Which Moroccan snack brands get the best feedback?",
            "Summarize what people say about buying snacks from hanout",
            "Are people complaining about snack prices this summer?",
            "List popular Moroccan snacks mentioned during school season",
            "Generate a report on how people feel about snack quality",
            "Highlight viral posts about traditional Moroccan biscuits",
            "Which snacks are seen as unhealthy by Moroccan users?",
            "Do people prefer imported snacks or local ones?",
            "What are the snack trends during Eid and Ramadan?"]

In [24]:
import pandas as pd

test_set = pd.Series(queries)

In [25]:
for query in test_set:
    structured_query = query_gen.run(query).content
    print(query_eval_agent.run("query: " + query + "structured_query :" + structured_query ).content)

```json
{
  "score": 10,
  "justification": "The structured query perfectly captures the user's intent. 'What do people think about the products' is addressed by the sentiment distribution, average sentiment score, and top recurring themes for both positive and negative feedback. 'Give me some numbers' is covered by the total volume of feedback and the percentages in the sentiment distribution. The filters are appropriately broad given no specific constraints were provided, and the output format is clear and relevant."
}
```
```json
{
  "score": 10,
  "justification": "The structured query perfectly captures the user's intent to understand the reasons behind negative product sentiment. The intent, filters, metrics, depth, and output format are all highly relevant and well-defined to address the 'why' aspect of the user's goal. It's a comprehensive and actionable query."
}
```
```json
{
  "score": 8,
  "justification": "The query effectively captures the user's intent to identify the mo

ModelProviderError: <Response [429 Too Many Requests]>