In [1]:
import os
import re
import yaml
import json
import ast
import praw
from crewai import Agent, Task, Crew, Process
from langchain_openai import ChatOpenAI

from textwrap import dedent
from reddit_helper import *    

from langchain.llms import OpenAI, Ollama
from langchain_openai import ChatOpenAI

from pydantic import BaseModel
from datetime import datetime, timedelta

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [2]:
# Load API credentials from a JSON file so that no key is written in the notebook.
api_file_path = 'api_keys.json'
with open(api_file_path, 'r') as file:
    api_keys = json.load(file)

# OpenAI chat models; both use the same key and sampling temperature.
openai_gpt35 = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    openai_api_key=api_keys['openai'],
)
openai_gpt4 = ChatOpenAI(
    model_name="gpt-4",
    temperature=0.7,
    openai_api_key=api_keys['openai'],
)

# Ollama-backed models; gemma2_9b is the one handed to the agents below.
llama31_8b = Ollama(model='llama3.1:8b')
gemma2_9b = Ollama(model='gemma2:9b')

In [3]:
cfg_file_path = 'casaai_config.yaml'
agent_cfg_path = 'agents.yaml'
task_cfg_path = 'tasks.yaml'

# Product/domain settings that are later passed to the crews as prompt inputs.
with open(cfg_file_path, 'r') as yaml_file:
    cfg = yaml.safe_load(yaml_file)
product_long = cfg.get('product_long_description', '')
product_short = cfg.get('product_short_description', '')
# Previously this read 'product_short_description' again (copy-paste slip).
# Read a dedicated 'product_name' key and fall back to the short description,
# so configs without that key behave exactly as before.
product_name = cfg.get('product_name', product_short)
domain = cfg.get('domain')
broad_keywords = cfg.get('broad_keywords', [])
output_format_1 = cfg.get('output_format_1')

# Per-agent settings (role / goal / backstory), keyed by agent name.
with open(agent_cfg_path, 'r') as yaml_file:
    agent_cfg_data = yaml.safe_load(yaml_file)

# Per-task settings (description / expected_out), keyed by task name.
with open(task_cfg_path, 'r') as yaml_file:
    task_cfg_data = yaml.safe_load(yaml_file)

In [4]:
# Incentive sentence passed to both crews under the "tip_text" input key.
tip_text = "If you do your BEST WORK, I'll give you a $100 commission!"

In [34]:
def re_decision_string(input_string):
    """Best-effort extraction of a filter decision from raw model text.

    Used as a fallback when the model output cannot be parsed as a dict.

    Args:
        input_string: Raw text expected to contain something shaped like
            ``{'comment_id': '...', 'decision': '...', 'justification': '...'}``.
            Single- or double-quoted keys/values are accepted, and the decision
            may be a quoted label (e.g. ``'relevant'``) or a bare integer.
            Non-string input (e.g. ``None``) is treated as empty text.

    Returns:
        Tuple ``(comment_id, decision, justification)`` of strings. When the
        id/decision pair is not found the first two are ``"na"`` and
        ``"discard"``; when no justification is present it is
        ``"Not available"``.
    """
    text = input_string if isinstance(input_string, str) else ""

    # The decision is a string label in DecisionOutput, so a digits-only
    # pattern would never match real output; accept quoted text or digits.
    pattern = (
        r"""['"]comment_id['"]\s*:\s*['"]([^'"]*)['"]\s*,\s*"""
        r"""['"]decision['"]\s*:\s*(?:['"]([^'"]*)['"]|(\d+))"""
    )
    match = re.search(pattern, text)

    if match:
        comment_id, quoted_decision, numeric_decision = match.groups()
        decision = quoted_decision if quoted_decision is not None else numeric_decision
    else:
        comment_id = "na"
        decision = "discard"

    if "justification" in text:
        # Everything after the last "justification" key, minus the quote,
        # colon and brace residue that surrounds the value.
        justification = text.split("justification")[-1].strip(" \t\r\n'\":,{}")
    else:
        justification = "Not available"

    return comment_id, decision, justification

In [35]:
def re_score_string(input_string):
    """Best-effort extraction of a relevance score from raw model text.

    Used as a fallback when the model output cannot be parsed as a dict.

    Args:
        input_string: Raw text expected to contain something shaped like
            ``{'comment_id': '...', 'score': 8.5, 'justification': '...'}``.
            Single- or double-quoted keys/values are accepted and the score
            may be an integer or a decimal, optionally quoted. Non-string
            input (e.g. ``None``) is treated as empty text.

    Returns:
        Tuple ``(comment_id, score, justification)``. On a match ``score`` is
        the matched numeric text (a ``str`` such as ``"8.5"``); callers
        convert it with ``float()``. When nothing matches the result is
        ``("na", 0, ...)``. ``justification`` is ``"Not available"`` when the
        text contains no justification.
    """
    text = input_string if isinstance(input_string, str) else ""

    # Allow a fractional part: a digits-only group truncates "8.5" to "8".
    pattern = (
        r"""['"]comment_id['"]\s*:\s*['"]([^'"]*)['"]\s*,\s*"""
        r"""['"]score['"]\s*:\s*['"]?(\d+(?:\.\d+)?)['"]?"""
    )
    match = re.search(pattern, text)

    if match:
        comment_id, score = match.groups()
    else:
        comment_id = "na"
        score = 0

    if "justification" in text:
        # Everything after the last "justification" key, minus the quote,
        # colon and brace residue that surrounds the value.
        justification = text.split("justification")[-1].strip(" \t\r\n'\":,{}")
    else:
        justification = "Not available"

    return comment_id, score, justification

In [36]:
# Persona fields for the filtering agent, taken from the agent config data.
_filter_persona = agent_cfg_data['content_filter_analyst']
backstory = _filter_persona['backstory']
goal = _filter_persona['goal']
role = _filter_persona['role']

# Agent that produces the per-comment filter decision; runs on gemma2_9b.
content_filter_analyst = Agent(
    role=role,
    goal=goal,
    backstory=backstory,
    llm=gemma2_9b,
    allow_delegation=False,
    verbose=False,
)

In [37]:
# Schema for the filter task's JSON output (passed to the Task as output_json).
# Documented with comments rather than a docstring so the model class itself
# is left exactly as it was.
class DecisionOutput(BaseModel):
    # Identifier of the comment the decision refers to.
    comment_id: str
    # Verdict label; downstream code checks for the value 'relevant'.
    decision: str
    # Reason text accompanying the verdict.
    justification: str

In [38]:
# Prompt text for the filter task, taken from the task config data.
_filter_task_cfg = task_cfg_data['content_filter_task']
description = _filter_task_cfg['description']
expected_out = _filter_task_cfg['expected_out']

# Task run by the filter agent; its output is requested in DecisionOutput form.
content_filter_task = Task(
    agent=content_filter_analyst,
    description=description,
    expected_output=expected_out,
    output_json=DecisionOutput,
)

In [39]:
# Single-agent, single-task crew that performs the relevance filtering.
response_creation_crew = Crew(
    agents=[content_filter_analyst],
    tasks=[content_filter_task],
    verbose=False,
)



In [11]:
# fetch_reddit_test and condense_data are not defined in this notebook;
# presumably they come from the star-import of reddit_helper (TODO confirm).
reddit_posts, reddit_post_ids = fetch_reddit_test()
# condensed_reddit_data is iterated below as a sequence of per-post sequences of
# dicts with 'comment_id', 'parent_id', 'text' and 'age' keys.
condensed_reddit_data, unique_post_ids, unique_comment_ids = condense_data(reddit_posts, reddit_post_ids)

post_cnt: 0 + 19 = 19
comm_cnt: 0 + 692 = 692
cond_cnt: 711 = 19 + 692
Cross_ck: 19 = 19


In [18]:
# Flatten the per-post data into one lookup table keyed by comment id.
comment_dict = {}
tot_data = 0
for idx1, item in enumerate(condensed_reddit_data):
    # Reset per post: without this, a post with no entries would print the
    # previous post's last index (or raise NameError on the very first post).
    idx2 = -1
    for idx2, data in enumerate(item):
        tot_data += 1
        c_id = data['comment_id']
        p_id = data['parent_id']
        text = data['text']
        age = data['age']
        comment_dict[c_id] = {'comment_id': c_id, 'parent_id': p_id, 'text': text, 'age': age}
    # Progress line: post index and index of its last entry (-1 when empty).
    print(idx1, ':', idx2)
comment_dict_list = list(comment_dict.keys())
# Equal numbers mean no comment id occurred twice.
print(len(comment_dict), '=', tot_data, len(comment_dict_list))

0 : 169
1 : 108
2 : 31
3 : 57
4 : 0
5 : 33
6 : 60
7 : 67
8 : 1
9 : 34
10 : 0
11 : 1
12 : 79
13 : 0
14 : 7
15 : 2
16 : 0
17 : 9
18 : 34
711 = 711 711


In [19]:
def get_data_details(comm_id, comments=None):
    """Collect a comment and its chain of ancestors.

    Starting at ``comm_id``, follows each record's ``'parent_id'`` for as long
    as the id is present in the lookup table.

    Args:
        comm_id: Id of the comment to start from.
        comments: Optional mapping of id -> record dict (each record must have
            a ``'parent_id'`` key). Defaults to the notebook-level
            ``comment_dict``.

    Returns:
        List of record dicts ordered from the starting comment upwards. Empty
        when ``comm_id`` is not in the table.
    """
    if comments is None:
        comments = comment_dict

    data_details = []
    visited = set()
    # `visited` stops the walk if parent links ever form a cycle, which would
    # otherwise loop forever.
    while comm_id in comments and comm_id not in visited:
        visited.add(comm_id)
        details = comments[comm_id]
        data_details.append(details)
        comm_id = details['parent_id']
    return data_details

In [None]:
# Run the filter crew once per comment; decision_lst mirrors the nesting of
# condensed_reddit_data (one inner list of result dicts per post).
decision_lst = []
for idx, reddit_data_item in enumerate(condensed_reddit_data):
    comment_lst = []
    for idx2, comment_data in enumerate(reddit_data_item):
        comm_id = comment_data['comment_id']
        comment_text = comment_data['text']
        print(f'STARTING {idx} - {idx2} - {comm_id}')
        input_dict = {"comment_id": comm_id,
                      "input_data": comment_text,
                      "product_long": product_long,
                      "product_short": product_short,
                      "domain": domain,
                      "broad_keywords": broad_keywords,
                      "output_format": output_format_1,
                      "tip_text": tip_text}
        decision_result = response_creation_crew.kickoff(inputs=input_dict)
        try:
            json_out = ast.literal_eval(decision_result.json)
        except Exception:
            # `Exception` rather than a bare except, so interrupting the kernel
            # still stops this long-running loop.
            print('Exception in json - trying re')
            _, d, j = re_decision_string(decision_result.raw)
            # Use the same 'comment_id' key as successfully parsed records
            # (it was 'comm_id'), so later cells can index every record alike.
            json_out = {'comment_id': comm_id, 'decision': str(d), 'justification': str(j)}
        comment_lst.append(json_out)
    decision_lst.append(comment_lst)

STARTING 0 - 0 - kgesxn
STARTING 0 - 1 - ggfdsvg
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 0 - 2 - ggfh2zn
STARTING 0 - 3 - ggepn0p
STARTING 0 - 4 - ggfd5lx
[93m Pydantic validation error: 3 validation errors for DecisionOutput
comment_id
  Field required [type=missing, input_value={'ggfd5lx': {'decision': ... words, not relevant.'}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
decision
  Field required [type=missing, input_value={'ggfd5lx': {'decision': ... words, not relevant.'}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
justification
  Field required [type=missing, input_value={'ggfd5lx': {'decision': ... words, not relevant.'}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing. Attempting to handle partial JSON.[00m
[93m Pydantic validation error: 3 validation e

In [21]:
# Join every parsed decision with its source comment, split the records into
# relevant / discarded, and build one caret-delimited text row per record.
analysis_lst = []
bad_list = []
tot_comments = 0
bad_cnt = 0
relevant_json_list = []
discarded_json_list = []
sep = "^"  # field delimiter of the text rows (set once, not per iteration)
for item in decision_lst:
    for comment in item:
        tot_comments += 1
        try:
            # The comment id is taken from the record's first key, whatever it
            # is named; malformed records fail here and are counted as bad.
            comm_key = list(comment.keys())[0]
            comm_id = comment[comm_key].strip()
            # Dict membership is equivalent to scanning comment_dict_list
            # (its key list) but does not rescan the list for every record.
            if comm_id in comment_dict:
                parent_id = comment_dict[comm_id]["parent_id"]
                if parent_id is None:
                    parent_id = 'NA'
                decision = comment['decision']
                justify = comment["justification"]
                comment_text = comment_dict[comm_id]["text"]
                text_str = sep + comm_id + sep + parent_id + sep + decision + sep + justify + sep + comment_text + sep
                if comment['decision'] == 'relevant':
                    relevant_json_list.append(comment)
                else:
                    discarded_json_list.append(comment)
                analysis_lst.append(text_str)
            else:
                # The id in the record does not belong to any fetched comment.
                print(f'Comment ID not found : {comm_id}')
                bad_cnt += 1
                bad_list.append(comm_id)
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt.
            print(f'comment key error : {comment}')
            bad_cnt += 1
print(f'relevant : {len(relevant_json_list)} discarded : {len(discarded_json_list)} bad_cnt: {bad_cnt} total: {tot_comments}')

Comment ID not found : string
Comment ID not found : ggg8fu
Comment ID not found : ggggp581
Comment ID not found : ggggbtyv
Comment ID not found : ggevxe
comment key error : {'ggfeg9e': {'decision': 'discard', 'justification': 'Positive sentiment with no follow-up questions.'}}
Comment ID not found : str
Comment ID not found : ggy9lj
Comment ID not found : ggg m3zh
relevant : 55 discarded : 106 bad_cnt: 9 total: 170


In [17]:
# Peek at the first record kept as relevant to check the record shape.
relevant_json_list[0]

{'comment_id': 'ggfh2zn',
 'decision': 'relevant',
 'justification': 'Seeks information about design choices in the room.'}

In [18]:
# Persist the analysis rows, one written line per row.
# NOTE(review): a row whose text contains a newline will span several lines
# in the file, since nothing is escaped here.
with open('decision_result_v6_0901A.txt', "w") as file:
    file.writelines(item + "\n" for item in analysis_lst)

print("List has been written")

List has been written


In [19]:
# Reload the saved rows (whitespace-stripped) and report how many lines came back.
with open('decision_result_v6_0901A.txt', "r") as file:
    analysis_lst_loaded_back = [raw_line.strip() for raw_line in file]
print(len(analysis_lst_loaded_back))

711


In [32]:
# Persona fields for the scoring agent, taken from the agent config data.
_scoring_persona = agent_cfg_data['content_scoring_analyst']
backstory = _scoring_persona['backstory']
goal = _scoring_persona['goal']
role = _scoring_persona['role']

# Agent that scores the comments kept by the filter stage; runs on gemma2_9b.
content_scoring_analyst = Agent(
    role=role,
    goal=goal,
    backstory=backstory,
    llm=gemma2_9b,
    allow_delegation=False,
    verbose=False,
)

In [33]:
# Schema for the scoring task's JSON output (passed to the Task as output_json).
# Documented with comments rather than a docstring so the model class itself
# is left exactly as it was.
class ScoreOutput(BaseModel):
    # Identifier of the comment the score refers to.
    comment_id: str
    # Numeric score; later compared against thresholds from 7.0 to 9.5.
    score: float
    # Reason text accompanying the score.
    justification: str

In [34]:
# Prompt text for the scoring task, taken from the task config data.
description = task_cfg_data['content_scoring_task']['description']
expected_out = task_cfg_data['content_scoring_task']['expected_out']

# The task must belong to the scoring agent: it was assigned to
# content_filter_analyst (copy-paste from the filter task), which is not a
# member of score_creation_crew and carries the filter persona.
content_scoring_task = Task(
                            description=description,
                            expected_output=expected_out,
                            output_json=ScoreOutput,
                            agent=content_scoring_analyst,
                            )

In [35]:
# Single-agent, single-task crew that performs the scoring.
score_creation_crew = Crew(
    agents=[content_scoring_analyst],
    tasks=[content_scoring_task],
    verbose=False,
)



In [36]:
# Score every comment kept by the filter stage, giving the crew the comment
# together with its chain of parents as context.
score_result_lst = []
for idx, comment_data in enumerate(relevant_json_list):
    # Records built by the regex fallback may carry the id under 'comm_id',
    # and ids echoed back by the model can have stray whitespace, which would
    # make the comment_dict lookup in get_data_details miss.
    comm_id = (comment_data.get('comment_id') or comment_data['comm_id']).strip()
    print(f'STARTING {idx} - {comm_id}')
    data_details = get_data_details(comm_id)
    input_dict = {"comment_id": comm_id,
                  "input_data": data_details,
                  "product_long": product_long,
                  "product_short": product_short,
                  "domain": domain,
                  "product_name": product_name,
                  "tip_text": tip_text}
    scoring_result = score_creation_crew.kickoff(inputs=input_dict)
    try:
        json_out = ast.literal_eval(scoring_result.json)
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel
        # still stops this long-running loop.
        print('Exception in json - trying re')
        _, s, j = re_score_string(scoring_result.raw)
        # Was float(d): `d` is a leftover from the decision cell (or undefined
        # on a fresh kernel); the score extracted here is `s`. The key is
        # 'comment_id' to match successfully parsed records.
        json_out = {'comment_id': comm_id, 'score': float(s), 'justification': str(j)}
    score_result_lst.append(json_out)

STARTING 0 - ggfh2zn
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 1 - ggfd5lx
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 2 - ggewh8g
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 3 - ggfqz4a
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 4 -  ggf874w
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 5 - ggg0jqv
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 6 - ggflytk
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
STARTING 7 - ggfk0u3
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0).

In [37]:
# Sanity check: first scored record, number of scored records, number of inputs.
score_result_lst[0], len(score_result_lst), len(relevant_json_list)

({'comment_id': 'ggfh2zn',
  'score': 8.5,
  'justification': 'Comment inquires about design choices, hinting at potential interest in transformation advice.'},
 174,
 174)

In [41]:
# Candidate score cut-offs; print how many scored comments reach each one.
thresh_list = [7.0, 7.5, 8.0, 8.5, 9.0, 9.5]

for thresh in thresh_list:
    thresh_up = sum(1 for scored in score_result_lst if scored['score'] >= thresh)
    print(thresh, ':', thresh_up, '/', len(score_result_lst))

7.0 : 78 / 174
7.5 : 67 / 174
8.0 : 24 / 174
8.5 : 11 / 174
9.0 : 2 / 174
9.5 : 0 / 174


In [42]:
# Join every score record with its source comment and build one
# caret-delimited text row per record.
analysis_lst = []
bad_list = []
tot_comments = 0
bad_cnt = 0
good_json_list = []
sep = "^"  # field delimiter of the text rows
for comment in score_result_lst:
    tot_comments += 1
    try:
        # The comment id is taken from the record's first key, whatever it is
        # named; malformed records fail here and are counted as bad.
        comm_key = list(comment.keys())[0]
        comm_id = comment[comm_key].strip()

        # Dict membership is equivalent to scanning comment_dict_list (its key
        # list) but does not rescan the list for every record.
        if comm_id in comment_dict:
            parent_id = comment_dict[comm_id]["parent_id"]
            if parent_id is None:
                parent_id = 'NA'
            score = comment['score']
            justify = comment["justification"]
            comment_text = comment_dict[comm_id]["text"]
            text_str = sep + comm_id + sep + parent_id + sep + str(score) + sep + justify + sep + comment_text + sep
            analysis_lst.append(text_str)
            good_json_list.append(comm_id)
        else:
            # The id in the record does not belong to any fetched comment.
            print(f'Comment ID not found : {comm_id}')
            bad_cnt += 1
            bad_list.append(comm_id)
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt.
        print(f'comment key error : {comment}')
        bad_cnt += 1
print(f'relevant : {len(good_json_list)} bad_cnt: {bad_cnt} total: {tot_comments}')

relevant : 174 bad_cnt: 0 total: 174


In [43]:
# Persist the score rows, one written line per row.
# NOTE(review): a row whose text contains a newline will span several lines
# in the file, since nothing is escaped here.
with open('score_result_v5_0827A.txt', "w") as file:
    file.writelines(item + "\n" for item in analysis_lst)

print("List has been written")

List has been written


In [44]:
# Reload the saved rows (whitespace-stripped) and report how many lines came back.
with open('score_result_v5_0827A.txt', "r") as file:
    analysis_lst_loaded_back = [raw_line.strip() for raw_line in file]
print(len(analysis_lst_loaded_back))

222


In [64]:
# For each post-comment combo select the post-comments whose score >= 7

In [None]:
# Then keep the top 20% of those comments or the top 3, whichever is larger, and craft responses for them

In [None]:
# Build responses for these comments with a manager LLM + a single response writer + a meta reviewer