In [29]:
import os
import re
import yaml
import json
import ast
import praw
from crewai import Agent, Task, Crew, Process
from langchain_openai import ChatOpenAI

from textwrap import dedent
from reddit_helper import *    

from langchain.llms import OpenAI, Ollama
from langchain_openai import ChatOpenAI

from pydantic import BaseModel

In [30]:
api_file_path = 'api_keys.json'
with open(api_file_path, 'r') as file:
    api_keys = json.load(file)        
openai_gpt35 = ChatOpenAI(model_name="gpt-3.5-turbo", 
                                temperature=0.7,
                                openai_api_key=api_keys['openai'])
openai_gpt4 = ChatOpenAI(model_name="gpt-4", 
                                temperature=0.7,
                                openai_api_key=api_keys['openai'])        
llama31_8b = Ollama(model='llama3.1:8b',)
gemma2_9b = Ollama(model='gemma2:9b',)

In [31]:
cfg_file_path = 'casaai_config.yaml'
agent_cfg_path = 'agents.yaml'
task_cfg_path = 'tasks.yaml'

with open(cfg_file_path, 'r') as yaml_file:
    cfg = yaml.safe_load(yaml_file)
product_long = cfg.get('product_long_description', '')      
product_short = cfg.get('product_short_description', '')
domain = cfg.get('domain')
output_format_1 = cfg.get('output_format_1')

with open(agent_cfg_path, 'r') as yaml_file:
    agent_cfg_data = yaml.safe_load(yaml_file)

with open(task_cfg_path, 'r') as yaml_file:
    task_cfg_data = yaml.safe_load(yaml_file)

In [32]:
def re_string(input_string):

    pattern = r"'comment_id':\s*'([^']*)',\s*'rank':\s*(\d+)"

    # Use re.findall to extract the values
    matches = re.findall(pattern, input_string)

    if matches:
        comment_id, rank = matches[0]
    else:
        comment_id = "na"
        rank = 0

    justification = input_string.split("justification")[-1]
    
    return comment_id, rank, justification

In [33]:
backstory = agent_cfg_data['content_filter_analyst']['backstory'] 
goal = agent_cfg_data['content_filter_analyst']['goal']
role = agent_cfg_data['content_filter_analyst']['role'] 
content_filter_analyst = Agent(
                            role=role,
                            goal=goal,
                            backstory=backstory,
                            allow_delegation=False,
                            verbose=False,
                            llm=gemma2_9b,
                            )

In [34]:
class DecisionOutput(BaseModel):
    comment_id: str
    decision: str
    justification: str

In [35]:
tip_text = "If you do your BEST WORK, I'll give you a $100 commission!"

In [36]:
description = task_cfg_data['content_filter_task']['description']
expected_out = task_cfg_data['content_filter_task']['expected_out'] 
        
content_filter_task = Task(
                            description=description,
                            expected_output=expected_out,
                            output_json=DecisionOutput,
                            agent=content_filter_analyst,
                            )

In [37]:
response_creation_crew = Crew(
    agents=[content_filter_analyst,],
    tasks=[content_filter_task,],
    verbose=False,
)



In [10]:
reddit_posts, reddit_post_ids = fetch_reddit_test()
condensed_reddit_data, unique_post_ids, unique_comment_ids = condense_data(reddit_posts, reddit_post_ids)

post_cnt: 0 + 10 = 10
comm_cnt: 0 + 560 = 560
cond_cnt: 570 = 10 + 560
Cross_ck: 10 = 10


In [38]:
comment_dict = {}
tot_data = 0
for idx1, item in enumerate(condensed_reddit_data):
    for idx2, data in enumerate(item):
        tot_data += 1
        c_id = data['comment_id']
        p_id = data['parent_id']
        text = data['text']
        comment_dict[c_id] = {'comment_id': c_id, 'parent_id': p_id, 'text': text}
    print(idx1, ':', idx2)
comment_dict_list = list(comment_dict.keys())
print(len(comment_dict), '=', tot_data, len(comment_dict_list) )

0 : 169
1 : 108
2 : 31
3 : 57
4 : 0
5 : 33
6 : 67
7 : 60
8 : 1
9 : 34
570 = 570 570


In [39]:
def get_data_details(comm_id):
    done = 0
    data_details = []
    while done==0:
        if comm_id in comment_dict:
            details = comment_dict[comm_id]
            data_details.append(details)
            comm_id = details['parent_id']
        else:
            done = 1
    return data_details

In [42]:
decision_lst = []
for idx, reddit_data_item in enumerate(condensed_reddit_data):
    comment_lst = []
    for idx2, comment_data in enumerate(reddit_data_item):        
        comm_id = comment_data['comment_id']  
        comment_text = comment_data['text']
        print(f'STARTING {idx} - {idx2} - {comm_id}')   
        input_dict = {"comment_id": comm_id,
                      "input_data": comment_text,
                      "product_long": product_long,
                      "product_short": product_short,
                      "domain": domain,
                      "output_format":output_format_1, 
                      "tip_text":tip_text}
        decision_result = response_creation_crew.kickoff(inputs=input_dict)
        try:
            json_out = ast.literal_eval(decision_result.json)
        except:
            print('Exception in json - trying re')
            _, d, j = re_string(decision_result.raw)            
            json_out = {'comm_id': comm_id, 'decision': str(d), 'justification': str(j)}
        comment_lst.append(json_out)
    decision_lst.append(comment_lst)

STARTING 0 - 0 - kgesxn
[93m Pydantic validation error: 3 validation errors for DecisionOutput
comment_id
  Field required [type=missing, input_value={'kgesxn': {'decision': '...with no design query.'}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
decision
  Field required [type=missing, input_value={'kgesxn': {'decision': '...with no design query.'}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing
justification
  Field required [type=missing, input_value={'kgesxn': {'decision': '...with no design query.'}}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/missing. Attempting to handle partial JSON.[00m
[93m Pydantic validation error: 3 validation errors for DecisionOutput
comment_id
  Field required [type=missing, input_value={'kgesxn': {'decision': '...with no design query.'}}, input_type=dict]
    For further information visit https://errors.pydantic.

In [43]:
analysis_lst = []
bad_list = []
tot_rel = 0
tot_comments = 0
relevant_json_list = []
discarded_json_list = []
for item in decision_lst:
    tot = 0
    rel = 0
    disc = 0   
    bad_cnt = 0
    sep = "^"
    for comment in item:
        tot += 1
        tot_comments += 1
        try:
            comm_key = list(comment.keys())[0]
            comm_id  = comment[comm_key].strip()
            if comm_id in comment_dict_list:                 
                parent_id = comment_dict[comm_id]["parent_id"]
                if parent_id is None:
                    parent_id = 'NA'
                decision = comment['decision']
                justify  = comment["justification"]
                comment_text = comment_dict[comm_id]["text"]
                text_str = sep + comm_id + sep + parent_id + sep + decision + sep + justify + sep + comment_text + sep
                if comment['decision'] == 'relevant':
                    rel += 1
                    tot_rel += 1
                    relevant_json_list.append(comment)
                else:
                    disc += 1
                    discarded_json_list.append(comment)
                analysis_lst.append(text_str)            
            else: 
                print(f'Comment ID not found : {comm_id}')
                bad_cnt += 1
                bad_list.append(comm_id)
        except:
            print(f'comment key error : {comment}')
            bad_cnt += 1
    print(f'tot : {tot}, rel : {rel}, discarded : {disc}, bad_cnt : {bad_cnt} analysis : {len(analysis_lst)}')
    print(f'total relevant : {tot_rel}, total comments : {tot_comments}')
    print(f'rel json count : {len(relevant_json_list)} discarded json count : {len(discarded_json_list)}')

Comment ID not found : ggggp581
Comment ID not found : gggbtvyv
Comment ID not found : ggy9lj
Comment ID not found : gggglvit
Comment ID not found : gglx92
Comment ID not found : gggmm3zh
tot : 170, rel : 51, discarded : 113, bad_cnt : 6 analysis : 164
total relevant : 51, total comments : 170
rel json count : 51 discarded json count : 113
Comment ID not found : str
tot : 109, rel : 48, discarded : 60, bad_cnt : 1 analysis : 272
total relevant : 99, total comments : 279
rel json count : 99 discarded json count : 173
tot : 32, rel : 11, discarded : 21, bad_cnt : 0 analysis : 304
total relevant : 110, total comments : 311
rel json count : 110 discarded json count : 194
tot : 58, rel : 27, discarded : 31, bad_cnt : 0 analysis : 362
total relevant : 137, total comments : 369
rel json count : 137 discarded json count : 225
tot : 1, rel : 1, discarded : 0, bad_cnt : 0 analysis : 363
total relevant : 138, total comments : 370
rel json count : 138 discarded json count : 225
Comment ID not foun

In [44]:
with open('decision_result_v5_0822B.txt', "w") as file:
    # Iterate through the list and write each string to the file
    for item in analysis_lst:
        file.write(item + "\n")  # Add a newline character after each string

print(f"List has been written")

List has been written


In [45]:
with open('decision_result_v5_0822B.txt', "r") as file:
    # Read all lines and strip the newline character from each line
    analysis_lst_loaded_back = [line.strip() for line in file.readlines()]
print(len(analysis_lst_loaded_back))

715


In [64]:
# For each post-comment combo select the post-comments whose score >= 7

In [None]:
# Then select a threshold like 20% or top 3 whichever is bigger of comments to craft response

In [None]:
# Build response for these comments With Manager LLM + only 1 response writer + meta reviewer